{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:16:30.922402Z",
     "start_time": "2018-12-28T02:16:30.638781Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to /home/stefan/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import nltk\n",
    "nltk.download('stopwords')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:16:31.728129Z",
     "start_time": "2018-12-28T02:16:30.925942Z"
    }
   },
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from gensim.models import Doc2Vec\n",
    "from gensim.models.doc2vec import TaggedDocument\n",
    "import logging\n",
    "import warnings\n",
    "from random import shuffle\n",
    "import lightgbm as lgb\n",
    "from sklearn.model_selection import train_test_split\n",
    "from nltk import RegexpTokenizer\n",
    "from nltk.corpus import stopwords\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score\n",
    "from sklearn.utils import class_weight\n",
    "import umap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:16:35.063506Z",
     "start_time": "2018-12-28T02:16:35.061306Z"
    }
   },
   "outputs": [],
   "source": [
    "warnings.filterwarnings('ignore')\n",
    "pd.set_option('display.expand_frame_repr', False)\n",
    "np.random.seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:16:35.246590Z",
     "start_time": "2018-12-28T02:16:35.238924Z"
    }
   },
   "outputs": [],
   "source": [
    "logging.basicConfig(\n",
    "        filename='doc2vec.log',\n",
    "        level=logging.DEBUG,\n",
    "        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',\n",
    "        datefmt='%H:%M:%S')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:31:45.448079Z",
     "start_time": "2018-12-28T02:31:21.942362Z"
    }
   },
   "outputs": [],
   "source": [
    "df = pd.read_parquet('combined.parquet', engine='fastparquet').loc[:, ['stars', 'text']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:31:45.481577Z",
     "start_time": "2018-12-28T02:31:45.449302Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5    2641880\n",
       "4    1335957\n",
       "1     858139\n",
       "3     673206\n",
       "2     487813\n",
       "Name: stars, dtype: int64"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.stars.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:49:18.719387Z",
     "start_time": "2018-12-28T02:49:18.715253Z"
    }
   },
   "outputs": [],
   "source": [
    "stars = range(1, 6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:32:45.883173Z",
     "start_time": "2018-12-28T02:32:45.238120Z"
    }
   },
   "outputs": [],
   "source": [
    "sample = pd.concat([df[df.stars==s].sample(n=100000) for s in stars])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:32:47.838880Z",
     "start_time": "2018-12-28T02:32:47.743143Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 500000 entries, 52085 to 3365007\n",
      "Data columns (total 2 columns):\n",
      "stars    500000 non-null int64\n",
      "text     500000 non-null object\n",
      "dtypes: int64(1), object(1)\n",
      "memory usage: 11.4+ MB\n"
     ]
    }
   ],
   "source": [
    "sample.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T00:07:55.455558Z",
     "start_time": "2018-12-28T00:07:55.438396Z"
    }
   },
   "outputs": [],
   "source": [
    "sample.stars = (sample.stars == 5).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:32:54.195893Z",
     "start_time": "2018-12-28T02:32:54.187161Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5    100000\n",
       "4    100000\n",
       "3    100000\n",
       "2    100000\n",
       "1    100000\n",
       "Name: stars, dtype: int64"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample.stars.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:33:04.902169Z",
     "start_time": "2018-12-28T02:33:03.174809Z"
    }
   },
   "outputs": [],
   "source": [
    "sample.to_parquet('yelp_sample_5.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:18:17.982897Z",
     "start_time": "2018-12-28T02:18:17.121098Z"
    }
   },
   "outputs": [],
   "source": [
    "sample = pd.read_parquet('yelp_sample.parquet').reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:33:08.533674Z",
     "start_time": "2018-12-28T02:33:08.526044Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stars</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>52085</th>\n",
       "      <td>1</td>\n",
       "      <td>Just terrible.\\n\\nI used to love Chili's - it ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>527763</th>\n",
       "      <td>1</td>\n",
       "      <td>I love Cold Stone ice cream, but this location...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3797997</th>\n",
       "      <td>1</td>\n",
       "      <td>I don't understand why people give this place ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4715860</th>\n",
       "      <td>1</td>\n",
       "      <td>Terrible disappointment.  It was a special cel...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2230375</th>\n",
       "      <td>1</td>\n",
       "      <td>Staff is awful. One called his coworker a bitc...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         stars                                               text\n",
       "52085        1  Just terrible.\\n\\nI used to love Chili's - it ...\n",
       "527763       1  I love Cold Stone ice cream, but this location...\n",
       "3797997      1  I don't understand why people give this place ...\n",
       "4715860      1  Terrible disappointment.  It was a special cel...\n",
       "2230375      1  Staff is awful. One called his coworker a bitc..."
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:19:00.180749Z",
     "start_time": "2018-12-28T02:18:56.814179Z"
    }
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAEKCAYAAAAMzhLIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3XuYXNV55/vvW7e+qVvdaklIqAUtLBkQ2BagYGzGyQTGWHgyluPgE2EfB58wh5wcM4nnjM8EZh57Ep5cTGYm2H7scQYbYuJEBg/jOIpDjLl5HBJH0GAZkGShBoTUkpBa91tf6vLOH3tXqyiqq3Z3V3dd9Ps8Tz1Ve+21V63dau23116Xbe6OiIjIZGK1roCIiNQ3BQoRESlLgUJERMpSoBARkbIUKEREpCwFChERKUuBQkREylKgEBGRshQoRESkrEStK1ANCxcu9P7+/lpXQ0SkoTz33HOH3H1RpXxNESj6+/sZGBiodTVERBqKmb0eJZ9uPYmISFkKFCIiUpYChYiIlKVAISIiZSlQiIhIWQoUIiJSlgKFiIiUpUAxRXp0rIicaxQopuCvt+zl3X/4BNv3n6h1VURE5owCxRRs33+SgyfH+MR9z7Dr0OlaV0dEZE5EChRmts7MdpjZoJndUWJ/i5k9FO7fbGb9YXqvmT1lZqfM7MsF+TvNbEvB65CZfSHc90kzGy7Y96+rc6ozd3xknI5UnJw7H//6Zg6eHK11lUREZl3FQGFmceArwI3AauBmM1tdlO1W4Ki7rwTuAe4O00eBzwKfKczs7ifdfU3+BbwOfKcgy0MF+78+nRObDcdH0iztbuMb/9fPsffYCN/9yd5aV0lEZNZFaVFcDQy6+6vuPg48CKwvyrMeeCD8/DBwvZmZu59296cJAkZJZrYKWAz8/ZRrP8eOnUkzvy3JO/u66WlPsuvwmVpXSURk1kUJFMuAPQXbQ2FayTzungGOA70R63AzQQuicDjRr5jZC2b2sJktj1jOrDs+kqa7LQnABb0d7FagEJFzQJRAYSXSiseIRskzmQ3Atwq2/wbod/d3Ao9ztqXy5i80u83MBsxsYHh4OOJXzUy+RQFw4YJ2Xj+iDm0RaX5RAsUQUPhXfR+wb7I8ZpYA5gNHKhVsZu8CEu7+XD7N3Q+7+1i4+TXgqlLHuvu97r7W3dcuWlTxuRsztnHzbg6dGuONE6Ns3LybE6Np9h4dIZ3Nzfp3i4jUUpRA8SywysxWmFmKoAWwqSjPJuCW8PNNwJMebWbazby5NYGZLS3Y/BCwPUI5sy6bc8YyOdqScQB6O1LkHPYdG6lxzUREZlfFJ9y5e8bMbgceBeLA/e6+1czuAgbcfRNwH/BNMxskaElsyB9vZruALiBlZh8GbnD3beHu/wP4YNFX/paZfQjIhGV9cgbnVzWj6SwAbakgUCzoaAHg9cNnuLC3o2b1EhGZbZEeherujwCPFKV9ruDzKPDRSY7tL1PuRSXS7gTujFKvuTSSDxTJfKBIAfD6EXVoi0hz08zsiEbGg0DRHrYoOlsTJGLG7sPq0BaR5qZAEVFxiyJmRk9Hit1qUYhIk1OgiOhM2KJoDVsUEHRov665FCLS5BQoIsq3KNpTZ7t1FoQtCi09LiLNTIEiopHxDHD21hMEgeLMeJZDp8ZrVS0RkVmnQBHRyHiWVCJGPHZ2Enp+5JP6KUSkmSlQRDSSzr6pNQGFgUIjn0SkeSlQRDQynp0YGpvX057CDHVoi0hTU6CI6Ew6S2tRiyIZj7Gkq1WryIpIU1OgiKhUiwLgggXtmp0tIk1NgSKi0RJ9FADLF7QzdFSBQkSalwJFRGfGSweKvp42Dp4cYyyTrUGtRERmnwJFBKPpLJmcT6wcW2hZdxvusP/YpE97FRFpaAoUERwfSQOUDBR9Pe0ADB3VcylEpDkpUEQwESgmufUEsPeY+ilEpDkpUERw7MzkLYol
81uJmVoUItK8FCgiyLco2pNvfc5TMh5j6fw29ipQiEiTUqCI4NiZYNG/Ui0KCDq01aIQkWYV6VGo57pyfRQbN+8mnc3x2qHTbNy8eyL9Y+++YM7qJyIym9SiiOD4SBoDWpKlf1zd7SmOj6TJ5vRcChFpPpEChZmtM7MdZjZoZneU2N9iZg+F+zebWX+Y3mtmT5nZKTP7ctExPwzL3BK+Fpcrq5aOj6RpTcaJmZXc39OexIETYctDRKSZVAwUZhYHvgLcCKwGbjaz1UXZbgWOuvtK4B7g7jB9FPgs8JlJiv+4u68JXwcrlFUzx0fSk/ZPQNCiADh6Rg8wEpHmE6VFcTUw6O6vuvs48CCwvijPeuCB8PPDwPVmZu5+2t2fJggYUZUsawrHV92xM+mS/RN5Pe1JAI6eUYtCRJpPlECxDNhTsD0UppXM4+4Z4DjQG6HsPwtvO322IBhMt6xZc3wkXXLl2Lz5bUmMs6OjRESaSZRAUeqv+eJe2yh5in3c3d8BvC98fWIqZZnZbWY2YGYDw8PDFb5qZvJ9FJNJxGN0tibUohCRphQlUAwBywu2+4B9k+UxswQwHzhSrlB33xu+nwQ2EtziilyWu9/r7mvdfe2iRYsinMb0jaazpOLlf1Td7Sm1KESkKUUJFM8Cq8xshZmlgA3ApqI8m4Bbws83AU+6+6QtCjNLmNnC8HMS+CXgpemUNRfSWScWK99N0tOeVGe2iDSlihPu3D1jZrcDjwJx4H5332pmdwED7r4JuA/4ppkNEvz1vyF/vJntArqAlJl9GLgBeB14NAwSceBx4GvhIZOWVSvpbI4KDQq621O8uPc4OfdJh9GKiDSiSDOz3f0R4JGitM8VfB4FPjrJsf2TFHvVJPknLatWMtkc8QoX/572FDkP5lLkh8uKiDQDzcyOIJ1z4hVuPXVriKyINCkFiggy2VzFPorutiBQHNfsbBFpMgoUFeRyTs6peOtpfrsChYg0JwWKCtK5HEDFW08tiThtybiGyIpI01GgqCCdDUbmRhnJ1N2eVItCRJqOAkUFmWy0FgUES3koUIhIs1GgqCDfoogaKI5p1JOINBkFigoy+T6KSLeeUoyks4xlsrNdLRGROaNAUUEm30cRoUUxMURWrQoRaSIKFBWkJ/ooKufNT7o7pn4KEWkiChQVTGXU03y1KESkCSlQVJBvUSQi3HrqbA0fYDSiuRQi0jwUKCrI5KL3UcRjRpeGyIpIk1GgqGBiHkXEpcM1RFZEmo0CRQXpKYx6gqBDW53ZItJMFCgqmMo8CgiGyJ4YSZPL1fShfCIiVaNAUUF6Ckt4QHDrKZNzDp9Wh7aINAcFigqmsoQHMPF0u33HRmatTiIic0mBooKpzMyGs3Mp9h9XoBCR5qBAUcF0+igA9h4bnbU6iYjMpUiBwszWmdkOMxs0sztK7G8xs4fC/ZvNrD9M7zWzp8zslJl9uSB/u5n9rZn9zMy2mtnnC/Z90syGzWxL+PrXMz/N6Zvqrae2VJxk3HTrSUSaRsVAYWZx4CvAjcBq4GYzW12U7VbgqLuvBO4B7g7TR4HPAp8pUfR/cfdLgCuAa83sxoJ9D7n7mvD19SmdUZXl51FEjBOYGd3tKYaOnpnFWomIzJ0oLYqrgUF3f9Xdx4EHgfVFedYDD4SfHwauNzNz99Pu/jRBwJjg7mfc/anw8zjwPNA3g/OYNVMd9QSwoD3F0FG1KESkOUQJFMuAPQXbQ2FayTzungGOA71RKmBm3cC/Ap4oSP4VM3vBzB42s+VRypktU731BMGkuz1H1KIQkeYQJVCUukIWzyaLkuetBZslgG8BX3L3V8PkvwH63f2dwOOcbakUH3ubmQ2Y2cDw8HClr5q2qXZmAyzoSHFiNKM1n0SkKUQJFENA4V/1fcC+yfKEF//5wJEIZd8L7HT3L+QT3P2wu4+Fm18Drip1oLvf6+5r3X3tokWLInzV9Ex1CQ+AnnAuhVoVItIMogSKZ4FVZrbCzFLA
BmBTUZ5NwC3h55uAJ929bIvCzH6fIKB8uih9acHmh4DtEeo4azLTuPXU0xEECnVoi0gzSFTK4O4ZM7sdeBSIA/e7+1YzuwsYcPdNwH3AN81skKAlsSF/vJntArqAlJl9GLgBOAH8R+BnwPMW3Nb5cjjC6bfM7ENAJizrk1U612lJZ3OYRXtwUd6CiRaFOrRFpPFVDBQA7v4I8EhR2ucKPo8CH53k2P5Jii155XX3O4E7o9RrLqRzOZJRnoNaoDUZo7MloRaFiDQFzcyuIJN1klO47QTBXIq+Be3s0RBZEWkCChQVZLI5ElNsUQAs72lTZ7aINAUFigrSOScZn1qLAmD5gnaGjo5QoU9fRKTuKVBUkMnmSMSm16IYSWc5dErPpRCRxqZAUUE66ySm0aLo62kHYI86tEWkwSlQVJDO5khNp49iQRAotOaTiDQ6BYoKMtNuUbQBmp0tIo1PgaKCTG56fRQdLQl6O7TcuIg0PgWKCtLZ6Y16AoK5FJqdLSINToGigkxuevMoIJxLoRaFiDQ4BYoK0hknMcWZ2Xl9Pe3sPTpCNqe5FCLSuBQoKkjncqQS0/sx9fe2k8m5np8tIg1NgaKCTHb6LYr+hR0AvHbodDWrJCIypxQoKkhPc60ngBVhoNh1WIFCRBqXAkUFmWmu9QSwuLOF9lRcLQoRaWgKFBVMd60nCJYbv7C3g10KFCLSwBQoKpjuWk95Kxa2s+uwhsiKSONSoKhgums95fX3drDnyBnS2VwVayUiMncUKCrI5Gbaouggk3MtDigiDUuBooL0DPoooGDkk/opRKRBRboCmtk6M9thZoNmdkeJ/S1m9lC4f7OZ9YfpvWb2lJmdMrMvFx1zlZm9GB7zJTOzMH2BmT1mZjvD956Zn+b0Zaa51tPGzbvZuHk3z+8+BsDDzw2xcfPualdPRGTWVQwUZhYHvgLcCKwGbjaz1UXZbgWOuvtK4B7g7jB9FPgs8JkSRX8VuA1YFb7Whel3AE+4+yrgiXC7Zmay1hNARypOSyLG4dNjVayViMjciXIFvBoYdPdX3X0ceBBYX5RnPfBA+Plh4HozM3c/7e5PEwSMCWa2FOhy9x978FDpPwc+XKKsBwrS55y7B6vHTnNmNgRDZBfOa+GwHokqIg0qSqBYBuwp2B4K00rmcfcMcBzorVDm0CRlnufu+8Oy9gOLSxVgZreZ2YCZDQwPD0c4janLhIv5JWfQogDonZfi0Cm1KESkMUW5Apb6c7p4OdQoeWaS/62Z3e9197XuvnbRokVTOTSyTDao0kxuPQH0drRw7EyaTE5DZEWk8US5Ag4Bywu2+4B9k+UxswQwHzhSocy+Sco8EN6ayt+iOhihjrMiHV7Yp7uER97CeSkcOHJat59EpPFECRTPAqvMbIWZpYANwKaiPJuAW8LPNwFPhn0PJYW3lE6a2TXhaKdfA/66RFm3FKTPuYkWxQz6KAB657UAqJ9CRBpSolIGd8+Y2e3Ao0AcuN/dt5rZXcCAu28C7gO+aWaDBC2JDfnjzWwX0AWkzOzDwA3uvg34TeAbQBvwd+EL4PPAt83sVmA38NFqnOh0ZMLZ1DO99bRwXgpA/RQi0pAqBgoAd38EeKQo7XMFn0eZ5ILu7v2TpA8Al5dIPwxcH6Ves208e/bW00xW4GhPJehsSXDghAKFiDQezcwuI3/raaajngAWd7Vw8ORo5YwiInVGgaKM/Cilmd56Aljc1crBE2Pk9PxsEWkwChRlpPMtihl2ZgOc19nKeDbHXj0/W0QajAJFGdWaRwFwXlcw8unlAydnXJaIyFxSoCgjPXHraeYtisWdrQC8fODUjMsSEZlLChRlpDPhqKcZLDOe15aK09WaYKdaFCLSYBQoyji71tPMWxQA53W18vJBBQoRaSwKFGWkqzThLm9xZwuDB09p5JOINBQFijLOzqOoXotiNJ1jz9EzVSlPRGQuKFCUMTGPogp9FBDMpQB1aItIY1GgKCNd5RbF4k4N
kRWRxqNAUUa1+yhak3HOn9+qQCEiDUWBooxq91EAvH1JJzveUKAQkcahQFHG2QcXVe/HdOnSLl4ZPsV4Rk+7E5HGoEBRRrUeXFRo9dIu0llnp+ZTiEiDUKAoo9p9FACrz+8CYNu+E1UrU0RkNilQlFHtmdkA/b0dtCXjbN+vFoWINAYFijLyaz1Vax4FQDxmXLykk237j1etTBGR2aRAUUZ6FloUENx+2rbvBO5aykNE6p8CRRmZbI5EzDCrcqBY2sWJ0YweYiQiDSFSoDCzdWa2w8wGzeyOEvtbzOyhcP9mM+sv2HdnmL7DzD4Qpl1sZlsKXifM7NPhvt81s70F+z5YnVOdukzOq/IsimL5Dm31U4hII6gYKMwsDnwFuBFYDdxsZquLst0KHHX3lcA9wN3hsauBDcBlwDrgv5lZ3N13uPsad18DXAWcAf6qoLx78vvd/ZGZneL0pbO5qjyLotglSzox08gnEWkMUa6CVwOD7v6qu48DDwLri/KsBx4IPz8MXG/B/Zr1wIPuPuburwGDYXmFrgdecffXp3sSsyWTnZ0WRXsqwYreDnVoi0hDiBIolgF7CraHwrSSedw9AxwHeiMeuwH4VlHa7Wb2gpndb2Y9pSplZreZ2YCZDQwPD0c4jalLZ3NVnUNR6NLzu9i2Xy0KEal/Ua6Cpf6kLh6uM1messeaWQr4EPA/CvZ/FXgbsAbYD/zXUpVy93vdfa27r120aNHktZ+BdNZJVTlQbNy8m42bd5PO5NhzZIT7n36NjZt3V/U7RESqKcpVcAhYXrDdB+ybLI+ZJYD5wJEIx94IPO/uB/IJ7n7A3bPungO+xltvVc2ZTC43K7eeAM7vbgNg6KhGPolIfYsSKJ4FVpnZirAFsAHYVJRnE3BL+Pkm4EkPJglsAjaEo6JWAKuAZwqOu5mi205mtrRg85eBl6KeTLVlsl7VdZ4KXbCgnZjBa4f0ECMRqW+JShncPWNmtwOPAnHgfnffamZ3AQPuvgm4D/immQ0StCQ2hMduNbNvA9uADPApd88CmFk78H7gN4q+8o/NbA3BLapdJfbPmXQ2V9WVYwu1JuOc393Gq4dOz0r5IiLVUjFQAIRDVB8pSvtcwedR4KOTHPsHwB+USD9D0OFdnP6JKHWaC7M1jyLvooUd/MPgYS05LiJ1TTOzy0hnc1Vd56nYioXzyLqz+8iZWfsOEZGZUqAoI53NVX3UU6H+XvVTiEj9U6AoY7Ym3OW1JOMsUz+FiNQ5BYoy0jmftQl3eSsWzmPoyAgj49lZ/R4RkelSoCgjk82RnKXhsXkXLeog687zu4/O6veIiEyXAkUZs33rCeDCcD7Fj185PKvfIyIyXQoUZczmWk95Lck4fT3t/Gjn7KxXJSIyUwoUZaRzszvqKe/t53XywtBxhk+Ozfp3iYhMlQJFGbO5hEehi5d0AvCjl9WqEJH6o0BRRjo7+6OeAJbOb2XhvBZ+qEAhInVIgaKMTC5HcpY7swFiZvzC2xfxo5eHyeaKV3AXEaktBYoygltPc/Mj+sVLFnF8JM2WPRomKyL1RYGijPHs3LQoAN63chExgx/u0O0nEakvkVaPPVdlZnGZ8WJ/++J+lve0853n97J0fttE+sfefcGcfL+IyGTUophELufknFmfcFfo4iWd7D02wonR9Jx9p4hIJQoUk0jngmdEzFWLAuCSJV0A7Nh/cs6+U0SkEgWKSWSyweijuZhHkXdeVwvd7Um2v3Fizr5TRKQSBYpJTASKOWxRmBmXLOnileFTpLN66p2I1AcFikmMZ/O3nuauRQFw6ZJO0lnnlYN6mJGI1IdIgcLM1pnZDjMbNLM7SuxvMbOHwv2bzay/YN+dYfoOM/tAQfouM3vRzLaY2UBB+gIze8zMdobvPTM7xenJ1KCPAmDFwg5SiRjb31A/hYjUh4rDY80sDnwFeD8wBDxrZpvcfVtBtluBo+6+0sw2AHcDv2pmq4ENwGXA+cDjZvZ2d88/pecX
3f1Q0VfeATzh7p8Pg9IdwO/M4BynZOPm3QAcOT0OwMCuo/gcTpZOxGOsWjyPHW+cwP38uftiEZFJRPlz+Wpg0N1fdfdx4EFgfVGe9cAD4eeHgevNzML0B919zN1fAwbD8sopLOsB4MMR6lh1uXApjTluUABw6ZIuToxm2HdsdO6/XESkSJTL4DJgT8H2UJhWMo+7Z4DjQG+FYx34gZk9Z2a3FeQ5z933h2XtBxZHO5XqyobNiJjNbR8FBPMpYgYv7j0+598tIlIsyszsUlfK4psxk+Upd+y17r7PzBYDj5nZz9z9RxHqE3xhEFxuA7jggurPXs5OtCjmPlB0tCRYtbiTnw4dI5dzYjWog4hIXpQWxRCwvGC7D9g3WR4zSwDzgSPljnX3/PtB4K84e0vqgJktDctaChwsVSl3v9fd17r72kWLFkU4jamZCBQ1aFEArLmgm+MjaTa/dqQm3y8ikhclUDwLrDKzFWaWIuic3lSUZxNwS/j5JuBJd/cwfUM4KmoFsAp4xsw6zKwTwMw6gBuAl0qUdQvw19M7tZnJee1aFBD0U6QSMb77k701+X4RkbyKgSLsc7gdeBTYDnzb3bea2V1m9qEw231Ar5kNAv8fwUgl3H0r8G1gG/B94FPhiKfzgKfN7KfAM8Dfuvv3w7I+D7zfzHYSjLT6fHVOdWryLYpa3fZJJWJcfn4Xj7y4n9F0tvIBIiKzJNLqse7+CPBIUdrnCj6PAh+d5Ng/AP6gKO1V4F2T5D8MXB+lXrOp1reeANYs7+H53cd4YvtB/uU7l9asHiJybtPM7EmMZYIJd6lE7X5EFy3qYHFnC995fqhmdRARUaCYxHgYKFpqGChiZtx0VR9P7TjI0NEzNauHiJzbFCgmMZYJ+gVakvGa1uPj11wIwDf/6fWa1kNEzl0KFJMYq4MWBcCy7jZuWL2Eh57do05tEakJBYpJjGVyxGxun0cxmVve28+xM2k2bSmeviIiMvsUKCYxlsmRSsSwGo56yrvmogVcfF4n3/jHXfhcrlAoIkLE4bHnovFMlpZEbfsn4Oxqtpcu7eK7W/byh4/8jBULO/jYu6u/bImISClqUUxiLJOref9EoTXLu2lPxXl653CtqyIi55j6uRLWmfE6CxSpRIxrLupl+xsnGT45VuvqiMg5pH6uhHUmaFHU/tZToWsu6iURM54eLH7Wk4jI7FGgmMRYJlvTWdmlzGtJcOUFPfxk91EOnVKrQkTmRn1dCetIvfVR5F27ciGZnPNn//BarasiIueI+rsS1omxdK7uWhQAizpbeMey+dz/9C4OntCjUkVk9tXflbBOjNdhH0XeDavPI5PLcc/jO2tdFRE5ByhQlJDJ5si605Kszx9P77wWPv7uC/n2wB4GD56sdXVEpMnV55Wwxuplnady/s11K2lLxrn7+ztqXRURaXKamV1CIwSKR7ce4L1v6+UH2w7wh3+7nf6FHQCasS0iVVe/V8IaGp94aFF99lHkvfdtC+lqTfB3L+3XGlAiMmsUKEqYeBZFHbcoIJit/S8uPY89R0fYuu9ErasjIk2qvq+ENdIIt57yrrigh8WdLTy69Y2J53yLiFRTpCuhma0zsx1mNmhmd5TY32JmD4X7N5tZf8G+O8P0HWb2gTBtuZk9ZWbbzWyrmf12Qf7fNbO9ZrYlfH1w5qc5NWcDRX3fegKIx4x1ly3h8OlxLRgoIrOiYqAwszjwFeBGYDVws5mtLsp2K3DU3VcC9wB3h8euBjYAlwHrgP8WlpcB/p27XwpcA3yqqMx73H1N+HpkRmc4DeMNcusp7+IlnVx+fhePbz/IS3uP17o6ItJkolwJrwYG3f1Vdx8HHgTWF+VZDzwQfn4YuN6CJ/6sBx509zF3fw0YBK529/3u/jyAu58EtgPLZn461TE20ZndGIHCzPjwmmW0t8T59ENb9MhUEamqKFfCZcCegu0h3npRn8jj7hngONAb5djwNtUVwOaC5NvN7AUzu9/MeiLU
saoaqY8ir70lwU1X9jF48BR3fW+bRkGJSNVEuRKWehZo8VVosjxljzWzecD/BD7t7vlhO18F3gasAfYD/7VkpcxuM7MBMxsYHq7uvfmxdI64GYl44wQKgFXndfIbv3ARGzfv5ktPDNa6OiLSJKJcCYeA5QXbfcC+yfKYWQKYDxwpd6yZJQmCxF+6+3fyGdz9gLtn3T0HfI3g1tdbuPu97r7W3dcuWrQowmlEV49LjEf1Ox+4hJuu6uOex1/m/qe1wqyIzFyUq+GzwCozW2FmKYLO6U1FeTYBt4SfbwKe9ODexyZgQzgqagWwCngm7L+4D9ju7n9SWJCZLS3Y/GXgpame1EyNZ3J1u85TJbGY8fmPvIMPXHYed31vG997oTimi4hMTcUlPNw9Y2a3A48CceB+d99qZncBA+6+ieCi/00zGyRoSWwIj91qZt8GthGMdPqUu2fN7J8BnwBeNLMt4Vf9h3CE0x+b2RqCW1S7gN+o4vlGUq/Poohi4+bdQDBr+2f7T/LpB7ewde8JfufGS2pcMxFpVNYMnZ5r1671gYGBqpS1cfNu7n/6NcazOf6fX3hbVcqslVNjGb76w0HSWefRf/vzLOtuq3WVRKSOmNlz7r62Ur7G/LN5lo1lsg3boig0ryXBr72nn3Q2xyfvf4ZjZ8ZrXSURaUCNfzWcBWOZ+ny63XSc19XKJ665kNcPn+HXv/EsZ8Yzta6SiDSY5rgaVtlYHT/dbjouWjSPL928hi17jvGbf/G8JuSJyJQoUJTQyMNjJ7Pu8qX80Ufewf96eZhbH3iW02NqWYhINM11NawCdw+fl91cP5qNm3eTzcFHr+rjHwcPc+MX/557f/RqraslIg1AT7grksk5OW+s5Tum4ooLekjGYzw0sIc/eWwHbak4H7v6AuKxUpPoRUTUoniLRlznaaouXzaff3PdSs7vbuOz332Jm/70H9l16HStqyUidUotiiJj6fwS483TmV3K4s5Wbr12BVv2HONvXtjHDff8iH/5zqWsvbCHj19zYa2rJyJ1RIGiSKMtMT4TZsYVF/SwYmEHDz8/xF/9ZC8Du45w8ZJO1vYvqHX1RKRONP/VcIrG87eeGnStp+nobk/x69eu4Feu7OP4SJqb/vTH/OZEkGrMAAAMrklEQVRfPKfbUSICqEXxFo30GNRqiplx1YU9vGPZfE6MpvnT//UKj28/wK/+3HJ+5co+1izvJljLUUTONQoURcYa7DGo1ZZKxFg4r4Xfvn4Vj28/wLee2cNf/NNuFnSk+L/fdxE3X72c7vZUraspInNIgaLI+Dkw6imKztYkv3xFHzdevpSt+07w/O6j3P39n/HFJ17mI1f28evX9rNycWetqykic0CBosi5eutpMq3JOFdd2MNVF/aw//gIP37lMN9+dg8bN+9m1eJ5vPdtC/lP/2o1Mc3DEGlaChRF8reezoVRT1O1dH4bH7myjxsuW8Izrx1h82uHeeDHu/j7wWE+9K7zuXblQtYs7ybZYI+QFZHyFCiKjGVyJGKmmcplzGtJcN0li/n5ty/kpb3HeWX4NF98YidfeHwnLYkYq8/v4p3L5vOety3kvSt76WpN1rrKIjIDChRFmmmJ8dmWiMVYs7yHNct7uPHyJbw6fJrXD59m77FRvvXMHh748evEY8YVy7v5+bcv4n2rFvLOvm4FYZEGo0BRpBkXBJwL7akEly+bz+XL5gOQzTm7j5xh58GT7Dxwinsee5k/eexl2pJxrrt0MSsXzWPhvBTnd7dx+bL5nNfVWuMzEJHJKFAUSGdzvDp8iqXz9cjQmYrHjBULO1ixsIMbVsPpsQyDw6cYPHCKp3ce4pEX9lP4EN7O1gRX9y/g8mXzuXRpFysXd3DBgg617kTqgAJFgce2HeDEaIb1a7R8RbV1tCR4V1837+rrBoIWx5nxDEdOj7P32Ah7j46w+8gZntxxkMLHuLen4sxrSbB8QTurFs/jgt52ejtSLOhooa+njeUL2pnX
ol9jkdkU6X+Yma0DvgjEga+7++eL9rcAfw5cBRwGftXdd4X77gRuBbLAb7n7o+XKNLMVwIPAAuB54BPuPicPe/7zH++iuz3JxUs0P2C2xWNGZ2uSztYkF/Z2TKSPZbIMnxxj+OQYR86MM5bOMTKe5fCpcX62fx+nx9/6dL72VJzO1gQ97Sn6ezu4aFEHiztb6G5PMb89yfy2JN1twfv8tiQJjcoSmZKKgcLM4sBXgPcDQ8CzZrbJ3bcVZLsVOOruK81sA3A38KtmthrYAFwGnA88bmZvD4+ZrMy7gXvc/UEz+9Ow7K9W42TLefnASf7p1SN84LIlxLRURc20JOL09bTT19Necv94JseZ8QynxoLWyJHT45weyzCWyXFqLMPA60f5wbY3yHnJwwHobEnQ1Zakqy1JeypOeypOWzJ8T8VpSyboaInT056id16K1uTZOTVGsJhiazJGR0uCeS0J2lNxOlIJUokYqUSMRMy03Ik0lSgtiquBQXd/FcDMHgTWA4WBYj3wu+Hnh4EvW/A/ZT3woLuPAa+Z2WBYHqXKNLPtwHXAx8I8D4Tlzkqg2LbvBC/uPcZl58/nLzfvJpWIsfbCntn4KqmS4GKcors9NWkwyeackXSW0fEsZ9JZRsazjKQznBnPMpLfHs8yms5y7Mw4B0/kSGed8WyOdCbHeDbHeCZHmVhTuZ7xGMm4kUzESMZjJGNGPG7ELRh6nYjFgvd4fvvN6a3JGO2pBG2pOO3JOK3JOBODxcIgZARrdMUMYjE7+9kMC9/jMaMlEQS1lkQMJ3iKozvkHHLu5MJtx8nlmDhvD+8Bvunn4Pm3cJ+/ZddEmhccaRjJuJFKxGgJA2oqHp9IS8RimAUtzfyrIxUE7JgZmZyDQzwe/KxiFr5XaQSd+5v/tacS6N39TfndnWzOiRf8wZDO5nCHZDxIy4W/o/l/HzNjLJPl9FiWlkSM9lQcdzg+kubkaIb5bUm62hKMpnPsPTbCqbEM589vZeG8ljmZ7BolUCwD9hRsDwHvniyPu2fM7DjQG6b/U9Gxy8LPpcrsBY65e6ZE/qr7wbY3+MLjOye2P3LlMjp0v7vhxWPGvPCv/enKuTOaznJqLEMm++aLiHvwH38sk2Msk2U8E3zO5pxseJHIvzI5J5vLkcsFZWY9eIJiLue4O2NpP5ueO3vhTofBajzrpDM50tlgxYCZBK9mZMZE4HCAfMDzMCDm8xFc/GMWBC0zJoJpJucTF/JSCoNvYRmOk86eDQrJuJHzs8sAxSz4wyaTDX4P8mnJeGxiBYh8WiIemzgOmBhCni1oGsdj9qZtCP4g+b31l3Hz1RfM6OdYSZT/SaXCVfGPdLI8k6WXuklcLv9bK2V2G3BbuHnKzHaUyjcV9wRvC4FDMy2rhlT/2lL9a6eR6w7TrP/H/vDsLZhpiPSUsiiBYghYXrDdB+ybJM+QmSWA+cCRCseWSj8EdJtZImxVlPouANz9XuDeCPWfEjMbcPe11S53rqj+taX6104j1x3qu/5Rhn88C6wysxVmliLonN5UlGcTcEv4+SbgSQ9u+m0CNphZSziaaRXwzGRlhsc8FZZBWOZfT//0RERkpiq2KMI+h9uBRwmGst7v7lvN7C5gwN03AfcB3ww7q48QXPgJ832boOM7A3zK3bMApcoMv/J3gAfN7PeBn4Rli4hIjVhxb/+5zsxuC29rNSTVv7ZU/9pp5LpDfddfgUJERMrSFFURESlLgaKAma0zsx1mNmhmd9S6PqWY2f1mdtDMXipIW2Bmj5nZzvC9J0w3M/tSeD4vmNmVtas5mNlyM3vKzLab2VYz++0Gq3+rmT1jZj8N6/97YfoKM9sc1v+hcIAG4SCOh8L6bzaz/lrWP8/M4mb2EzP7XrjdMPU3s11m9qKZbTGzgTCtUX5/us3sYTP7Wfh/4D2NUncFipCdXarkRmA1cLMFS5DUm28A64rS7gCecPdVwBPhNgTnsip83cYcLIVSQQb4d+5+KXAN
8KnwZ9wo9R8DrnP3dwFrgHVmdg1nl51ZBRwlWHYGCpa2IZimc3cN6lzKbwPbC7Ybrf6/6O5rCoaSNsrvzxeB77v7JcC7CP4NGqPuwXR+vYD3AI8WbN8J3Fnrek1S137gpYLtHcDS8PNSYEf4+b8DN5fKVw8vgqHP72/E+gPtBItWvptg/k+i+PeIYFTfe8LPiTCf1bjefQQXpOuA7xFMcm2k+u8CFhal1f3vD9AFvFb882uEuru7WhQFSi1VMmvLh1TZee6+HyB8Xxym1+05hbcxrgA200D1D2/bbAEOAo8BrzD5sjNvWtoGyC9tU0tfAP49kF8votyyOfVYfwd+YGbPWbA6AzTG789FwDDwZ+Ftv6+bWQeNUXcFigKRlw9pIHV5TmY2D/ifwKfd/US5rCXSalp/d8+6+xqCv8yvBi4tlS18r6v6m9kvAQfd/bnC5BJZ67L+oWvd/UqCWzOfMrOfL5O3nuqfAK4EvuruVwCnOXubqZR6qrsCRYEoS5XUqwNmthQgfD8YptfdOZlZkiBI/KW7fydMbpj657n7MeCHBH0t3RYsXQNvruNE/e3NS9vUyrXAh8xsF8EzX64jaGE0Sv1x933h+0HgrwiCdSP8/gwBQ+6+Odx+mCBwNELdFSgKRFmqpF4VLqFSuOzJJuDXwhEU1wDH883cWjAzI5hpv93d/6RgV6PUf5GZdYef24B/QdAhOdmyM5MtbVMT7n6nu/e5ez/B7/eT7v5xGqT+ZtZhZp35z8ANwEs0wO+Pu78B7DGzi8Ok6wlWrKj7ugPqzC58AR8EXia47/wfa12fSer4LWA/kCb4q+NWgvvGTwA7w/cFYV4jGMn1CvAisLbGdf9nBM3nF4At4euDDVT/dxIsK/MCwQXqc2H6RQRrmA0C/wNoCdNbw+3BcP9Ftf79KTiXfw58r5HqH9bzp+Fra/7/aAP9/qwBBsLfn+8CPY1Sd83MFhGRsnTrSUREylKgEBGRshQoRESkLAUKEREpS4FCRETKUqAQmaZwNdD/d5rH9pvZx6pdJ5HZoEAhMn3dwLQCBcHCjgoU0hAUKESm7/PA28JnI/xnM/v/zezZ8PkB+WdV/Fy43RrOLN5qZpeHx74vPPbf1vQsRCrQhDuRaQpXwP2eu19uZjcQLHPxGwSzajcBf+zuPzKz3yeY5dxGsN7PH5nZPwc+4+6/VJPKi0xBonIWEYnghvD1k3B7HsFDZ34E3EWwltgo8Fs1qZ3IDChQiFSHAX/k7v+9xL4FBIEjSdCyOD2XFROZKfVRiEzfSaAz/Pwo8OvhszYws2Vmln8Izb3AZ4G/5OzjRAuPFalralGITJO7HzazfzCzl4C/AzYCPw5WU+cU8H+a2Tog4+4bw+ey/6OZXQf8PZAxs58C33D3e2p0GiIVqTNbRETK0q0nEREpS4FCRETKUqAQEZGyFChERKQsBQoRESlLgUJERMpSoBARkbIUKEREpKz/DecErv0c3J59AAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.distplot(sample.text.str.split().str.len());"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Doc2Vec"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Basic text cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:33:19.647682Z",
     "start_time": "2018-12-28T02:33:19.644233Z"
    }
   },
   "outputs": [],
   "source": [
    "tokenizer = RegexpTokenizer(r'\\w+')\n",
    "stopword_set = set(stopwords.words('english'))\n",
    "\n",
    "def clean(review):\n",
    "    tokens = tokenizer.tokenize(review)\n",
    "    return ' '.join([t for t in tokens if t not in stopword_set])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:33:35.354851Z",
     "start_time": "2018-12-28T02:33:20.198492Z"
    }
   },
   "outputs": [],
   "source": [
    "sample.text = sample.text.str.lower().apply(clean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:33:35.368047Z",
     "start_time": "2018-12-28T02:33:35.355960Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stars</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3713191</th>\n",
       "      <td>1</td>\n",
       "      <td>called 938 placed order informer ian manager a...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3632813</th>\n",
       "      <td>3</td>\n",
       "      <td>ok best tip sell stuff buffalo exchange sharin...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1414414</th>\n",
       "      <td>5</td>\n",
       "      <td>afford rooms well worth money absolutely amazi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4609094</th>\n",
       "      <td>3</td>\n",
       "      <td>little bit pricier nw competition peak hours d...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4996179</th>\n",
       "      <td>4</td>\n",
       "      <td>great pigging comfort food visiting great frie...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1826950</th>\n",
       "      <td>5</td>\n",
       "      <td>went sun auto fri mar 9th dealt patrick mantan...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4210188</th>\n",
       "      <td>5</td>\n",
       "      <td>went nail salon must say impressed level custo...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1354353</th>\n",
       "      <td>5</td>\n",
       "      <td>rita must love custard black cherry little bit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2760</th>\n",
       "      <td>1</td>\n",
       "      <td>drittes goa pfaffing erlebt absolut nix unterh...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1118726</th>\n",
       "      <td>1</td>\n",
       "      <td>visited week ago im finally writing review pla...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         stars                                               text\n",
       "3713191      1  called 938 placed order informer ian manager a...\n",
       "3632813      3  ok best tip sell stuff buffalo exchange sharin...\n",
       "1414414      5  afford rooms well worth money absolutely amazi...\n",
       "4609094      3  little bit pricier nw competition peak hours d...\n",
       "4996179      4  great pigging comfort food visiting great frie...\n",
       "1826950      5  went sun auto fri mar 9th dealt patrick mantan...\n",
       "4210188      5  went nail salon must say impressed level custo...\n",
       "1354353      5  rita must love custard black cherry little bit...\n",
       "2760         1  drittes goa pfaffing erlebt absolut nix unterh...\n",
       "1118726      1  visited week ago im finally writing review pla..."
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample.sample(n=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:33:57.874953Z",
     "start_time": "2018-12-28T02:33:55.863246Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 485681 entries, 52085 to 3365007\n",
      "Data columns (total 2 columns):\n",
      "stars    485681 non-null int64\n",
      "text     485681 non-null object\n",
      "dtypes: int64(1), object(1)\n",
      "memory usage: 11.1+ MB\n"
     ]
    }
   ],
   "source": [
    "sample = sample[sample.text.str.split().str.len()>10]\n",
    "sample.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create sentence stream"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:34:06.903097Z",
     "start_time": "2018-12-28T02:34:01.100396Z"
    }
   },
   "outputs": [],
   "source": [
    "sentences = []\n",
    "for i, (_, text) in enumerate(sample.values):\n",
    "    sentences.append(TaggedDocument(words=text.split(), tags=[i]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Formulate the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:34:06.906431Z",
     "start_time": "2018-12-28T02:34:06.904273Z"
    }
   },
   "outputs": [],
   "source": [
    "size=300\n",
    "window=5\n",
    "min_count=0\n",
    "epochs=5\n",
    "negative=5\n",
    "dm = 1\n",
    "dm_concat=0\n",
    "dbow_words=0\n",
    "workers = 8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Passing `documents` makes the constructor build the vocabulary and train\n",
    "# immediately. `dm` comes from the settings cell instead of being hard-coded,\n",
    "# and `vector_size` replaces the deprecated `size` keyword.\n",
    "model = Doc2Vec(documents=sentences,\n",
    "                dm=dm,\n",
    "                vector_size=size,\n",
    "                window=window,\n",
    "                min_count=min_count,\n",
    "                workers=workers,\n",
    "                epochs=epochs,\n",
    "                negative=negative,\n",
    "                dm_concat=dm_concat,\n",
    "                dbow_words=dbow_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:37:01.442409Z",
     "start_time": "2018-12-28T02:34:07.761377Z"
    }
   },
   "outputs": [],
   "source": [
    "# Passing `documents` makes the constructor build the vocabulary and train\n",
    "# immediately. `vector_size` replaces the deprecated `size` keyword, which\n",
    "# newer gensim releases no longer accept.\n",
    "model = Doc2Vec(documents=sentences,\n",
    "                dm=dm,\n",
    "                vector_size=size,\n",
    "                window=window,\n",
    "                min_count=min_count,\n",
    "                workers=workers,\n",
    "                epochs=epochs,\n",
    "                negative=negative,\n",
    "                dm_concat=dm_concat,\n",
    "                dbow_words=dbow_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T01:56:03.959942Z",
     "start_time": "2018-12-28T01:52:41.532880Z"
    }
   },
   "outputs": [],
   "source": [
    "# NOTE(review): Doc2Vec(documents=...) already trains on construction, so this\n",
    "# call runs a further `model.epochs` passes over the corpus - confirm intended.\n",
    "model.train(sentences,\n",
    "            total_examples=model.corpus_count,\n",
    "            epochs=model.epochs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:37:01.604728Z",
     "start_time": "2018-12-28T02:37:01.443607Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>token</th>\n",
       "      <th>similarity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>great</td>\n",
       "      <td>0.869434</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>decent</td>\n",
       "      <td>0.824517</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ok</td>\n",
       "      <td>0.759463</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>bad</td>\n",
       "      <td>0.749852</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>amazing</td>\n",
       "      <td>0.748687</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>awesome</td>\n",
       "      <td>0.733886</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>okay</td>\n",
       "      <td>0.719382</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>tasty</td>\n",
       "      <td>0.707188</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>nice</td>\n",
       "      <td>0.702621</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>delicious</td>\n",
       "      <td>0.692123</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       token  similarity\n",
       "0      great    0.869434\n",
       "1     decent    0.824517\n",
       "2         ok    0.759463\n",
       "3        bad    0.749852\n",
       "4    amazing    0.748687\n",
       "5    awesome    0.733886\n",
       "6       okay    0.719382\n",
       "7      tasty    0.707188\n",
       "8       nice    0.702621\n",
       "9  delicious    0.692123"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check on the learned word vectors: nearest neighbours of 'good'.\n",
    "# Word-level queries go through `model.wv`; calling most_similar directly on\n",
    "# the Doc2Vec model is deprecated.\n",
    "pd.DataFrame(model.wv.most_similar('good'), columns=['token', 'similarity'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Persist Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:37:02.197070Z",
     "start_time": "2018-12-28T02:37:01.605570Z"
    }
   },
   "outputs": [],
   "source": [
    "# Write the trained model to the current working directory.\n",
    "model_file = 'sample5.model'\n",
    "model.save(model_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T00:54:04.864287Z",
     "start_time": "2018-12-28T00:54:03.581152Z"
    }
   },
   "outputs": [],
   "source": [
    "# Reload the model from the file written by the save cell ('sample5.model'),\n",
    "# so the document vectors used below belong to the current `sample`.\n",
    "model = Doc2Vec.load('sample5.model')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:38:50.845013Z",
     "start_time": "2018-12-28T02:38:50.804633Z"
    }
   },
   "outputs": [],
   "source": [
    "# Shift the star ratings down by one to obtain the class labels used below.\n",
    "y = sample['stars'] - 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:37:03.062359Z",
     "start_time": "2018-12-28T02:37:02.201730Z"
    }
   },
   "outputs": [],
   "source": [
    "# Feature matrix: row i holds the learned vector of the document tagged i.\n",
    "doc_ids = range(len(sample))\n",
    "X = np.zeros((len(y), size), dtype=np.float64)\n",
    "for i in doc_ids:\n",
    "    X[i, :] = model.docvecs[i]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:37:03.065414Z",
     "start_time": "2018-12-28T02:37:03.063244Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(485681, 300)"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train-Test Split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:39:04.085998Z",
     "start_time": "2018-12-28T02:39:03.631216Z"
    }
   },
   "outputs": [],
   "source": [
    "# Hold out 20% of the documents for testing, stratified on the label;\n",
    "# the fixed seed keeps the split reproducible.\n",
    "X_train, X_test, y_train, y_test = train_test_split(X,\n",
    "                                                    y,\n",
    "                                                    test_size=0.2,\n",
    "                                                    random_state=42,\n",
    "                                                    stratify=y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:39:06.672781Z",
     "start_time": "2018-12-28T02:39:06.665668Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Baseline Score: 20.16%\n"
     ]
    }
   ],
   "source": [
    "# Majority-class baseline: always predict the most frequent training label.\n",
    "mode = pd.Series(y_train).mode().iloc[0]\n",
    "constant_pred = np.full_like(y_test, fill_value=mode)\n",
    "baseline = accuracy_score(y_true=y_test, y_pred=constant_pred)\n",
    "print(f'Baseline Score: {baseline:.2%}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-27T23:35:39.040108Z",
     "start_time": "2018-12-27T23:35:38.953294Z"
    }
   },
   "outputs": [],
   "source": [
    "# One 'balanced' weight per class present in the training labels.\n",
    "# Arguments are passed by keyword: current scikit-learn rejects positional\n",
    "# `classes`/`y`, and the same keyword names work in older releases.\n",
    "class_weights = class_weight.compute_class_weight(class_weight='balanced',\n",
    "                                                  classes=np.unique(y_train),\n",
    "                                                  y=y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-27T23:35:39.240067Z",
     "start_time": "2018-12-27T23:35:39.237696Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.52585038, 1.59482003, 2.12184306])"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "class_weights"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LightGBM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:39:10.223141Z",
     "start_time": "2018-12-28T02:39:10.217963Z"
    }
   },
   "outputs": [],
   "source": [
    "# Wrap both splits as LightGBM datasets; the validation set is derived from\n",
    "# the training Dataset via create_valid.\n",
    "train_data = lgb.Dataset(X_train, label=y_train)\n",
    "test_data = train_data.create_valid(data=X_test, label=y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:39:10.908403Z",
     "start_time": "2018-12-28T02:39:10.901251Z"
    }
   },
   "outputs": [],
   "source": [
    "# Multiclass objective with five target classes.\n",
    "params = dict(objective='multiclass',\n",
    "              num_classes=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:42:04.449691Z",
     "start_time": "2018-12-28T02:39:11.555708Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[25]\ttraining's multi_logloss: 1.50257\tvalid_1's multi_logloss: 1.51211\n",
      "[50]\ttraining's multi_logloss: 1.45251\tvalid_1's multi_logloss: 1.4704\n",
      "[75]\ttraining's multi_logloss: 1.41546\tvalid_1's multi_logloss: 1.44103\n",
      "[100]\ttraining's multi_logloss: 1.38507\tvalid_1's multi_logloss: 1.41809\n",
      "[125]\ttraining's multi_logloss: 1.35921\tvalid_1's multi_logloss: 1.39942\n",
      "[150]\ttraining's multi_logloss: 1.33601\tvalid_1's multi_logloss: 1.38295\n",
      "[175]\ttraining's multi_logloss: 1.31554\tvalid_1's multi_logloss: 1.36904\n",
      "[200]\ttraining's multi_logloss: 1.29656\tvalid_1's multi_logloss: 1.35624\n",
      "[225]\ttraining's multi_logloss: 1.27918\tvalid_1's multi_logloss: 1.34486\n",
      "[250]\ttraining's multi_logloss: 1.26276\tvalid_1's multi_logloss: 1.33447\n"
     ]
    }
   ],
   "source": [
    "# Boost for a fixed 250 rounds and report the log-loss on the training and\n",
    "# held-out sets every 25 rounds.\n",
    "lgb_model = lgb.train(params,\n",
    "                      train_data,\n",
    "                      num_boost_round=250,\n",
    "                      valid_sets=[train_data, test_data],\n",
    "                      verbose_eval=25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:43:01.722585Z",
     "start_time": "2018-12-28T02:43:00.450410Z"
    }
   },
   "outputs": [],
   "source": [
    "# Take the highest-scoring column of the prediction matrix as the predicted class.\n",
    "class_scores = lgb_model.predict(X_test)\n",
    "y_pred = np.argmax(class_scores, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:48:08.595370Z",
     "start_time": "2018-12-28T02:48:08.514152Z"
    }
   },
   "outputs": [],
   "source": [
    "# Confusion matrix of true vs. predicted labels on the held-out set.\n",
    "cm = confusion_matrix(y_test, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T03:56:59.129287Z",
     "start_time": "2018-12-28T03:56:58.910377Z"
    }
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWQAAAD8CAYAAABAWd66AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzs3Xd4k9XbwPHvSdIFZe9dBGQpG0QUUAFllyF7r7KHbGQPUURBlL2RvX4oKIjKElzsPZS9aZltoUmb9Lx/pNSWbm2bNO/94cplcs55ntzHJ71zcp6ltNYIIYRwPIOjAxBCCGEnCVkIIZyEJGQhhHASkpCFEMJJSEIWQggnIQlZCCGchCRkIYRwEpKQhRDCSUhCFkIIJ2FK6TfwqjfT5U4F/HNhD0eHkCJezuPt6BCSnaueiPr33WBHh5AiyhTwVv91HV7l+yV6q4ccm/2f3y85yQhZCCGcRIqPkIUQIlWptDvOlIQshHAtBqOjI/jXJCELIVyLcqpp4SSRhCyEcC1peMoi7UYuhBCxUSrxjwRXpeoqpS4opS4qpUbGUl9DKXVUKWVVSr0fpbycUup3pdQZpdRJpVSrxIQuI2QhhGtJphGyUsoIzAHqADeBQ0qprVrrs1GaXQc6A0NfWPwZ0FFr/bdSKi9wRCm1U2v9OL73lIQshHAtyTeHXAW4qLW+bF+tWgf4ApEJWWt9NaIuPOqCWuu/ojy/rZTyB3IAkpCFEP+PJN9RFvmAG1Fe3wReS+pKlFJVAHfgUkJtZQ5ZCOFalCHRD6WUn1LqcJSHX9Q1xbL2JJ37qZTKA6wEumitwxNqLyNkIYRrScKUhdZ6IbAwjuqbQIEor/MDtxMfhsoIfA+M0Vr/kZhlZIQshHAtSRghJ+AQUEwpVVgp5Q60BrYmKgR7+y3A11rrjYkNXRKyEMK1JFNC1lpbgX7ATuAcsEFrfUYpNUkp1RhAKVVZKXUTaAEsUEqdiVi8JVAD6KyUOh7xKJdQ6DJlIYRwLcbkO3Vaa70d2P5C2bgozw9hn8p4cblVwKqkvp8kZCGEa5FTp4UQwknIqdPJZ/4Hdbi2tieH53WILGv2ZjGOzO/I0+8HUaFYrniXNxgUv89ux+YJvpFly4bX5eDc9kzs9EZk2cg2r9Gw6kvJ34FECA21MLJvR4b6teaDbi1Yv2J+jDY/btvE4O4tGdqzDWMGduXGtcsAnD99nCE9WjGyTwfu3LIfIvk0OIgpI/qiHXw19nFjRvFW9ddp5tsw1nqtNZ9MnULDunV4v2kjzp21T7ddvXKZ1i2a0aJpY04cPwaA1WrFr1tnQkJCUiv8WI0fM4q3a7xO8yax9+nK5Ut0bNeKyuVfYcWyJZHlDx8+pHOHNjRv0pDdu36OLB/Uvzf+/vdSPO74uOrnL1Iynjqd2pwuIa/86Sy+Y7ZEKztz7QGtJ2/jwOmbCS7fz7c8F64/jHz9ik92AKr0WcUbr+QjYzp3cmdJT6Xiufnuj8vJG3wiubm5M/6z+Xy2cB3TF6zh+KHf+OvsqWht3nynLjMWb+CzBWvxbdWJFfNmALBt0yqGjp9Om259+XGbfeftplWLaNq2K8rBHzDfJs2Yt2BxnPUH9v/C9WtX2bbjR8ZNmMyUSRMA2LhhPQM/GMJnX3zJimVLAdiwfi0NG/ni5eWVGqHHqXGTZsydH3efMmXKzPCRo+nYuVu08h+2f0cj36Z8vXodK5bZl9+3dzclSpYmZ874BxUpzVU/f5GS7yiLVOd0Ef16+hYPg8zRyi7ceMjftx4luGy+7N7UrVKYZTtPR5aF2cLxcjehFLibDNjCNWM7vM7klb8le+yJpZTCyysdADarFZvVGuPLOl36f26nZDGHRH7YjUYToaEWQs1mjEYTd2/f4OH9AEqXrZhq8celYqXKZMyUKc76Pbt30ahxE5RSlClbjqCgQAIC/DGZTFjMZswhZkxuJgIDA9m3
dw+NfJukYvSxS6hPWbNl45VXy2AyRZ/9e96n0NBQDAYDVquV1StX0KlLtzjWlHpc9fMXKQ2PkF1qDnl6z7cYvWQ/3l7ukWUXbjzkRkAQv3/VjrW7z1Ekb2aUghOXAhwYKdhsNkb0ac/dWzeo69uSYiVfjdHmh2838N2mVVitVsZPt/+sbNqmCwtmTMHdw4P+Iyfz9YIvaN25d2qH/6/4+98jV+7cka9z5cqN/717tG7TjjEfjiA0NJSx4yeyYN4cevj1cp4R179Qr0EjRg0fwndbv2Hg4GFsWLeGho2bOHzE/5xLf/7+P16gXinVRWu9LDmD+S/qVSmM/+NnHLvoT/VXox+FMmzBvsjnmyb40v/LnxneugplCudg17FrLPvh9IurS3FGo5HPFqzlaXAQ08cP4fqVixQsXDRam7q+Lanr25L9u3awefVi+o2YROGixZk6ewUAZ08eJUu2HGg0MyaPxGQy0bHXB2TOki3V+5MoscwxKqXIkzcvS5avBOD6tWsEBPhTuPBLfDhyGGFhYfTtPxAfn8KpHe1/kiFDBmbPs58AFvjkCcuWLGLGrK+YOH4MQYGBdOjUhbLlyjssPpf+/DnhVERi/ZfIJ8ZVEfX8cOuN3//DWyTe66Xy0rDqS5xf3pWvR9bnrbIFWDqsbrQ2Dau+xNG/7pHO043ShbLR/uPvaftOSbw8HPdDIb13BkqXrcTxQ3FPobzx9nsc/HVvtDKtNZtXL+b99t3Z+PVCWnXqSfVa9dm+ZV0KR/zv5cyVm3t370a+vnfvLjly5ozW5qsvZ9K3/0DWrF5JgwaN6NO3Pwvmzk7tUJPVgvlz6O7Xix3bv6dUqdJMmDyVr2bNcHRYgIt+/tLwlEW8CTniwsqxPU4Bce6Z0Fov1FpX0lpXMhV4PdmDjs245b9StMNiSnReSsdPtrP3xA26Tv8hst5kNNDXtzwzNx8mnYcpcrBmMCjcTan7E+fJ40c8DQ4CwGIxc/Lon+Qr6BOtzZ2b1yOfH/3zAHnyF4xWv/fHbVR47U28M2TEYjGjDAYMBkWoOfr8uzN56+132Lb1G7TWnDxxHG/vDOTI8U9CPnzoIDlz5qJQIR/MISH2PhmNhDhxnxJy7dpVAvz9qVS5SmSflILQ0FCHxeTyn780vFMvoaFhLuA94MU9agpIkb1iK0bUo3qZAmTP6MnFld2ZvPJ3HgWbmdH7bbJn8uJ/E305eTmAxmO2kCdreuYOqkPTcd8kuN5ejcqyatc5QixWTl25j1JwaG4Hdh6+wpOnlpToSpweP7zP7GnjCQ+3obXm9Zq1qVi1BuuWz6PIy6WoXK0mO75dz6mjBzGaTHh7Z6Df8H9+kFjMIez78TvGTJsDQKP32/PZhGGY3NwYNHpqqvYlqhFDB3P40EEeP35EnXdq0Ltvf6xWKwAtW7Wheo2aHPhlHw3r1cHT04tJU/6JVWvNogXzmP75FwA0b9GKUSOGYrNZGT12giO6A8DIYf/06d1aNejd558+tWjVhvv3A2jbqjlPg4NRBgOrV63gf99ux9vbvlNs9pcz6TfgAwDq1W/IoAF9WbPqa/r0G+CwPrnq5y+SEybaxFLxHTuolFoCLNNaH4ilbo3Wum1Cb+BVb6aTHJyYfP5c2MPRIaSIl/N4J9wojXGWQ2OT2993gx0dQoooU8D7P88jePkuSPRWD/m2p1PNW8Q7QtZax3mMTmKSsRBCpDonnBtOLJc67E0IIdLylIUkZCGEa5ERshBCOIe0fEKRJGQhhEuRhCyEEE5CGSQhCyGEU5ARshBCOAlJyEII4SQkIQshhLNIu/lYErIQwrXICFkIIZyEwSBn6gkhhFOQEbIQQjiLtJuPJSELIVyLjJCFEMJJSEIWQggnIadOx+P7mR1T+i1SXc81Rx0dQopY1K6Co0NIdrkzezo6hBSR0UvGUnFJyyPktHt8iBBCxEIplehHItZVVyl1QSl1USk1Mpb6Gkqpo0opq1Lq/RfqOiml/o54dEpM
7PI1K4RwKck1QlZKGYE5QB3gJnBIKbVVa302SrPrQGdg6AvLZgXGA5UADRyJWPbFG0ZHIyNkIYRLScYRchXgotb6stY6FFgH+EZtoLW+qrU+CYS/sOx7wE9a64cRSfgnoG5CbygJWQjhWlQSHvHLB9yI8vpmRFli/KtlZcpCCOFSknLqtFLKD/CLUrRQa73weXUsi+jErvrfLCsJWQjhUpIyhxyRfBfGUX0TKBDldX7gdiJXfRN464Vl9ya0kExZCCFcS/JNWRwCiimlCiul3IHWwNZERrETeFcplUUplQV4N6IsXpKQhRAuJbl26mmtrUA/7In0HLBBa31GKTVJKdU44r0qK6VuAi2ABUqpMxHLPgQmY0/qh4BJEWXxkikLIYRLSc4TQ7TW24HtL5SNi/L8EPbpiNiWXQosTcr7SUIWQriUtHymniRkIYRLkWtZCCGEk5ARshBCOAlJyEII4STScD6WhCyEcC0yQhZCCCdhkJ16QgjhHNLwADltnKkXbrPx0aBOzJk8NEbdz9+uZWLftkwZ0IEvxvbngf8dAO7evMbUwV2YMrAjl8+fAsBms/LF2AGEWsypGn9sWlfKx+puFVnVtSITG5XA3RjzU1SrRHbWdKvI6m72NgAFs3qxrFN5vu5SgVfyZgDAqODLVq/iYXLc5gwNtTCiT0cG92jNwK4tWLd8fow2Z04eZWjPtrSoU4Xf9/0cWX7rxlWG9WrH4B6tuXDmJGDfVhOG9cZiDkm1PsRm6sQxNKhdnfYtfWOtX/31Ujq1aUanNs1o39KX6pVfJfDJYx49ekjvru1p39KXX/bsimw/YnA/AgL8Uyv8WPnfu8uwft3o3rYJPdo1ZcuG1XG2vXDuNPWql2f/np8AuHHtKn27tqZXpxacPX0CAJvVyoiBfpgdvK2eMxhUoh/OJk2MkHd/t4HcBXwwP3sao65A4ZcZNWMp7h6e7NvxP7Ysn0v34ZM5sPMbmnbsTbacedjy9Tx6jnyVX3Zs4bW36+Lu4djb+uTwdqdFxXy0XXIYizWcKb4lqV0yJ9tP34tskz+LJx2rFqTnqhMEWaxkSecGQJNyeZi77wp3npjpU7MwH35zjqbl8/LDGX8s1hcvyZp63NzcmfD5fLy80mG1hjFmYDcqVHmDl0u9GtkmR87c9Bs+ka0bV0Zb9sdt/6N99/7kyJ2XVYu/Ynjp6ezcuomatevj4emV2l2Jpn6jJjRv2ZbJ40fFWt+uY1fadewKwIFf9rB+9ddkzJSZjWtXUa+hL7Xeq8+Q/n7UeLsWB37ZQ/ESpciRI2dqdiEGo9GIX/+hFCtekmdPn9KvW2sqVK5KocJForWz2WwsmfsFFatUiyzb/u1GuvYaSO48eVkybxbjps7guy0bqP1eQzwdvK2ec+kRslKqhFKqllLK+4XyBC+2nBwe3ffn9OHfeKNOo1jri5epGJlgXypemkcP7KMPo8lEqMVCqMWM0WjkWXAQpw4doOrb9VIj7AQZDQoPkwGjAk+TgfvBodHqfcvmYdPR2wRZrAA8ehYGgNWm8TAZ8DQZsYZrvD2MvFk0W7Rk7ghKKby80gH2EZPVao1x8ZacufPiU6RYjJ0uJpOJ0FD7tjIZTTwNDuLw77/w1rsNUyv8OJWrUImMmTIlqu3PP2ynznv1AXufLBYLYaGhKGXAarWyYc1K2nbokpLhJkq27DkoVrwkAOnSp6dAoZe4H8uo/dtNa3nzrdpkzpI1suz535XZbMZkMhEcFMgfv+6jdr3Y/z4dITlv4ZTa4h0hK6UGAH2xX1hjiVJqoNb624jqqcAPKRwfGxd/QdNOfbGEPEuw7a8/fUfpilUBqFm/Ocu/mIw1LJS2fUawff1S6rbo7BQbISA4lDUHb7Cl92tYrDYOXnnMwavR7+xSIIt9tLGgXVkMBsWSA9f448ojNh+9zbiGxXEzGpi282+6vlGIFb9fd0Q3YrDZbAzv3Z67t25Q17clL5d8NeGFgLq+Lfhq2njCQkPp
+cFoNny9iObtujnFtkosc0gIf/x+gMEjRgNQp24DJowezo7vt9JnwGC2bFxH3QaN8fRyjlHkc3fv3OLS3+cpUTr6trofcI/fftnNtC8X8de5M5HljZu1ZvqUMYSFhjJg+FhWL1tAm049nGpbOVEoSZbQlEUPoKLWOlgp5QNsUkr5aK1nkZiL1/1Hpw79SobMWShUtAR/nYr/Ts9/7v2BaxfPM3jqHACy5sjN4I/sz/3v3OTxw/vkzl+IZTMnYguz0qhdD3LlK5jSXYhVBg8T1Ytlp/n8gwRZrHzkW5L3SuVk59l/Rikmg6JAFi/6rD1JzgwezG9XlnZLDnMvyELftfZ51vyZPcnu7c7VB88Y16A4bkbFwv3XuPHIMXN5RqORzxeu5WlwENPGDeH6lYsULFw0weVy5MrDpBn2S9LeuXWDRw8CyFfAh1kfj8VqDaNN597kLVAopcP/Tw7s30uZsuXJmCkzAN4ZMvDZl/MACAx8wqrlS5j62Rd8MnkcQUGBtGnfmVfKlHNkyIQ8e8bk0UPoNWAY6dNH+wHM/FnT6dZ7EEajMVp5ztx5mD57CQC3bl7nwf0AChQqzKeTPiQsLIxOPfqSv6BPanUhVkm5QL2zSShyo9Y6GOz3jsJ+weV6SqkZxJOQlVJ+SqnDSqnD321Y8a+Du3TuJCcPHmB0j2Ys+WwcF04eYdmMCTHanTt+iB82rqD36Gm4ubnHqN+6cgGN2/mxZ9sGqtR8j4Ztu/H9uiRdhClZVfbJzJ0nZh6HhGEL1+z76z6v5ssYrY1/kIVf/n6ALVxz54mZ6w+eRY6an+tZw4eF+6/SsmI+fjzrz+ID1+j2huMTV3rvDLxSrhLHDv2W5GXXLJ1D6y69+X7LOmrUqkerTj3ZsDKu64c7j107d1A7YrriRcsWzaNTNz9+/mE7xUuW5sNxU5g/+4tUjjA6qzWMyaMH88679Xnzrdox6v86f4aPx4+gY/N67N/7E1999hG//bI7WpvlC76iU4++fLNxDW+/24AO3fuwaumC1OpCnJRK/MPZJDRCvquUKqe1Pg4QMVJuiP2ScnH+Ho16Ff7d5x8k9pYnMTTp2JsmHXsD8Nepo/z0zRq6DJ4Qrc2NyxdYM28a/cfPJGPmrDHW8dfpY2TOlp2ceQsQarGglMJgMDr0SIu7gRZK582Ah8mAxRpOpUJZOHc3KFqbX/5+QJ2IHX2ZvEwUyJqOW4//ibl8gUwEBIdy85EZTzcD4RpsWuPp5pjRwZPHjzCZTKT3zoDFYubkkT9p0jpRdz6PdObEEbJmy0ne/AUJtZhRBvu2spgdf1RMfIKDgjh29BDjpnwSo+7G9WvcDwigfMXK/H3hPB6eHqAUoaGhsawpdWitmfHxBAoUeonmrTvG2ubrTTsin382ZSyvvVGDajXeiSw7eeww2XPkJF+BQljMZgxKYTQYsFgcf6SFM02fJFVCCbkjYI1aEHHR5o5KKYd9FW5bvYiCRUtQ9rXqbF42B0tICIs+HQNAluy56DPm0+exsmPDcroPnwzAm+/5smzGBMJtNtr0Huao8Dl7J4g9F+6zonMFrOGav+4F8+2JO/R4sxDn7gZx4OJD/rjyiCqFs7CmW0XCNczee5lA8z+bonO1goz55hwA35y4w8SGJTAaFNN/vOiQPj16cJ/Zn47HZrOhtaZazdpUer0Ga5fNo2jxUlSuVpOL588wbfxQngYHcvj3/axbsYBZSzcC9m21adUShoyzJ7U6DZrxxdTR2Gw2eg6K/QiH1DD+w6EcO3yIx48f06TeO3Tr2de+wxJo+n4rAPbt+ZkqVd+I3KkZ1cI5s/DrOxCAOnXrM3LIADasXUX3Xv1SrxMvOHPyGLt++I7CRYrRu1NLALr07I//Pfshow2btox3ea01a5YvYvTk6QDU923OtImjsNls9B86OmWDT4Q0nI9RWv/rAWyi/JcRsrMa/e2ZhBulQYvaVXB0CMkud2bHHuKYUoLN1oQbpUE+2T3/czqt
OHlPonPOkbFvO1X6ThPHIQshRGKl5RGyJGQhhEtxxjPwEksSshDCpbjyTj0hhEhT0nA+loQshHAtMkIWQggnkYbzsSRkIYRrkZ16QgjhJGTKQgghnIQkZCGEcBJpOB9LQhZCuBYZIQshhJNIw/k4bdzkVAghEis5b3KqlKqrlLqglLqolBoZS72HUmp9RP2fETfyQCnlppRaoZQ6pZQ6p5RK1CULJSELIVyKQalEP+KjlDICc4B6QCmgjVKq1AvNugGPtNZFgZnAtIjyFoCH1vpVoCLQ83myjjf2JPRTCCGcXjLeMaQKcFFrfVlrHQqsA3xfaOMLPL8t0iaglrJPYmsgvVLKBHgBoUBgQm8oCVkI4VKS8a7T+YAbUV7fjCiLtU3EzTueANmwJ+enwB3gOvCZ1vphQm8oCVkI4VIMKvGPqPf/jHj4RVlVbBn7xYvfx9WmCmAD8gKFgSFKqZcSij3Fj7IolTdjwo3SmA/rF3d0CCli2bFbjg4h2bV5JY+jQ0gRbsY0fChBvP77HV6Scup01Pt/xuImUCDK6/zA7Tja3IyYnsgEPATaAj9orcMAf6XUr0Al4HK8sSc6ciGESANUEv4l4BBQTClVWCnlDrQGtr7QZivw/G6+7wO7tf2+eNeBd5RdeqAqcD6hN5TjkIUQLiW5ri2ktbYqpfoBOwEjsFRrfUYpNQk4rLXeCiwBViqlLmIfGbeOWHwOsAw4jX1aY5nW+mRC7ykJWQjhUpLzTD2t9XZg+wtl46I8N2M/xO3F5YJjK0+IJGQhhEtJy2fqSUIWQriUhE74cGaSkIUQLkUuUC+EEE4iDQ+QJSELIVyLTFkIIYSTSLvpWBKyEMLFyAXqhRDCSaThfXqSkIUQrkWOshBCCCchUxZCCOEk0vAAWRKyEMK1yAhZCCGcRNpNx05+PWT/u3cY2KsLHVo0olNLXzatXRmjjdaaWZ9NpW3TenRp05S/zp8F4PrVK/To0JKubZtx+uRxAKxWK4P7dMdsDknVfsQm3GZjxtBuLJk6Itb647/t5tNBHZg+qCOrv5gEgP+t68wc3p3Ph3Th6oXTANhsVhZM/IBQiznVYo+Lp8lAx4p5GfG2D8Pf8qFQlugXGy+SzYspdYsyuEYhBtcoRJ1i2QBI726k3xsFGFrTh1dye0e271I5Lxk9jKnZhRhCQy2MHdCJUb3bMtyvJZtWLoi13R+//MQwv5YM92vJ7E/GAHD7xlVG9+vAqN5t+fus/cqLNpuVqSP7YDE7bnuFhloY1a8jw3q2YXD3lmxYEXufAP745Wda1qnEpQv2v6vzp48z1K81o/p25O4t+92NngYH8dHIftgvA+x4RoNK9MPZOPUI2Wgy0XfQMF4uUYpnT5/So2NLKr1WDZ+XikS2+fO3/dy8fp3V/9vO2dMnmfHJZOYvX8vWLRvx6zeIPHnysWD2TF759Au+3byed+s3wtPTy4G9stu/fRO58hfC/OxpjLqAOzfY/b/V9Jsyl3TeGQh68giAP37aSoN2PcmSMzfbVy3AZ9gUft/5LRVqvou7x3+/08J/1eSVnFwIeMrXR25jVOBmjPl9f+VhCEsORr8zSfl8GTh0I5DjtwLpUbUAp+8GUypXem49sRBosaVW+LFyc3Nn9LR5eHqlw2q1MmlId8pWqkaxkq9Gtrl76zpb1y9nwueLSZ8hI08e22+dtnv7Flp37UeOXHlYt3Q2g0p9ys/fbebNWvXx8HTc9nJzc2f89PmRfRr3QTfKVa7Gy6VejdYu5NlTdnyznmIlXoks+27TaoaM+5SAe7f5cdsmOvb6gM2rFtO0TRenmSpwljj+DaceIWfLnoOXS9jvup0ufXoK+bxEQMC9aG0O7NvDew0ao5Si9KtlCQ4K4sH9AEwmE6FmC2ZzCCaTiaCgQH7bv5f3GjR2RFeiefzAn3NHfqdKrQax1v/583e8Ubcp6bwzAJAhUxYADEYjYaEWwiwWjEYTIU+DOHv4VyrVrJtqscfFw2Tg
pWxe/Hn9CQA2DWZreKKWtYXbb0lkMhrQWmNQUKNwFvZcSvCekClOKYWnVzoAbFYrNqs1xh/87h3fUKdhC9JnsN+uLFPmrIB9QBFqMWOxmDGaTDwNDuLon/upXjv27Z5aEtMngPXL59O4ZUfc3N0jy4wmE6Ghlsg+3b19k4cP/ClVtmKqxZ+QZLzrdKpLcISslKoCaK31IaVUKaAucD7iws2p5s7tW/x94RylSpeJVn4/4B45c+WOfJ0jZy4C/O/RtEUbpo4fRWhYGENHjWPF4vl06OLnFN+e3y77ioYdemMOeRZrfcBt+0/B2aP7EB4ezrstu1Ci/Gu8UbcZa7/6CJs1jOZ+Q/lp4wpqNe/oFH3Kls6NpxYbrcvlJm9GD24+NvPNGX9CbdF/xhbK4sWQGoV4YrGy7UwA94JDOXYrkHYV8lApfya+PxdANZ/MHL4ZSJjNOX4Ch9tsjO7fgXu3b1KnUQuKRhkxgn2EDDBhcDfCw8Np3r4HZStVo06jFsyfPp6wsDC6DRjFltWLadLaOUaS4TYbI/p04O7tG7zXuAXFSkbv05WL57kfcJeKVauzbeM/U4VNW3dm4cyPcPfwoN+ISaxc+AWtOvVO7fDj5bLXslBKjQfqASal1E/Aa8BeYKRSqrzW+qOUDxGePXvGuBEf0H/wCNJ7e0eri23eSilFrtx5mLVgOQA3b1znQYA/BX0KM2XcSKxhYXTr1Z8ChXxSIfrozh7+De9MWchfpDgXTx+LtU24zcb9OzfpPfFLHj/wZ+7Y/gyduZwsOXLRZ9KXANy/c5PAh/fJma8ga76cgs0aRt3W3cmRt0Cs60xpBgX5Mnmy5bQ/1x+b8S2dk3eKZuWHCw8i29x8YmHKz5cItWlK5ExPl8r5+GTPFczW8MhpDC83A28XzcryQ7doUSYXXm5G9l1+yLVHjptzNRiNfDx3DU+Dg5g5aRg3rl6kgE/RyHqbzca92zcY8+kCHt6/x6Shfkybv47sOXMzZrp9fvbu7Rs8ehhA3gI+zP10HFZrGC069iJP/kIO69P0BfY+fTZhKNevXKRgYXufwsPDWTFvBn2GTYixnE/R4nz01XIAzp48SpZsOdBoZk4ZhdFkomPPQWTOki0VexJTGs7HCU5ZvA/ynz6dAAAgAElEQVS8AdQA+gJNtNaTgPeAVnEtFPXW2iuXLf5PAVqtYYwbMYjadRtQ4506Mepz5MyN/727ka8D/O+RPUfOaG0Wz51Ft1792bx+NXXqNqRLz74sXzTvP8X1b129cIqzh37lo94tWf3FRC6ePsqaWZOjtcmULQelK7+J0WQiW6685MhbgIA7N6O12bF2Ee+16caB7ZupUL0O77Xqyo8blqVmV6J5YrbyxGzl+mN74jx5J4h8maLPk1qs4ZEj5vP+TzEaFOndo++0q/NyNn7++wHl82Xk5hMz60/cpX6JHKnTiQSk985AyTIVOXn492jlWbPnpELVGphMJnLmzkfe/AUjR83PbVg+lxYde7Hz2/W88U5d3u/Qk/+tXpSa4ccqvXcGSpWtyPEofTKHPOPG1UtMHNqTvu0b8fe503w6bnDkjj2wD4T+t3oJzdt1Z9PKRbTs2JMateqxY8s6R3QjGqVUoh/OJqGEbNVa27TWz4BLWutAAK11CBDnBKHWeqHWupLWulKHLt3/dXBaa6ZNHkchn5do1a5TrG3eqPEWO7/fitaaM6dOkN7bm2zZ//kDPn7kENlz5iJ/wUJYzGYMBoXRYMRsccyRFvXb9WTsws2MnreBdoPGU/SVCrQdODZam1eqVOfi6aMAPA18TMCdG2TLlTey/tKZ42TKmoMceQoQajHbP1wGA2GhllTtS1RBFhuPQ8LIkd4NgGLZ03EvKDRamwxRjpgokNkTpeBp6D877bKndyOTh4nLD0JwNyq0BrTG5MC94YGPH/E0OAiAUIuZM8cOkqeAT7Q2larV5NzJIwAEPXnMnZvXyZknX2T9uZNHyJotJ7nzFcRiMaOU
AYPBQKjFMdvrxT6dOnqQfFH6lC69N0s272LOqm3MWbWNYiVfYfikGRQpXiqyzb4fv6PCa2/inSGjvU8GhVIGLE5wtI9RqUQ/nE1Cc8ihSql0EQk5ctZeKZWJeBJycjl14hg/bt/GS0WL0a1tcwB69B3Ivbt3APBt3oqqb9Tgj1/307ZpPTw8vRg57p/Rptaar5cuYOLHnwPQsOn7TBk7EpvNyuCRY2O+oQP9sG4JBYoUp3TlNylergp/nTjEp4M6YDAYaNihD+kzZALsffp589d0GDwRgKp1GrFm1mTCbTaa+Q1xZBfYctqfdhXyYjQoHj4LZd3xu7xeyB7379eeUCZPBqr5ZCY8XBMWrll15Ha05euVyM6O8/cBOHYriC6V81H9pSz8cOF+qvfluccP7zP/8wmE28LROpzXatSmwmvV2fT1fAoXK0nF12tSpuLrnDryJ8P8WmIwGGjbfSAZMmYG7Nvrm7VLGfDhxwC8U68pc6eNxWaz0aX/SIf06dHD+8z5dDzh4fY+vV6jDhWrVmf98vkUebkklarVjHd5i9nMvp++Y/QncwBo2Lwdn08cjsnNjYEfpsosZryc8Gi2RFPxHTuolPLQWsf4GldKZQfyaK1PJfQGdwPDnGPPTDI6dM3xe/9Twt6rjx0dQrJr80oeR4eQItyMaTjrxKNswQz/uWODt55PdM6Z0biEU/2PjHeEHFsyjii/Dzhu2CKEEHFwxrnhxHLqE0OEECKp0vKUhSRkIYRLScMDZEnIQgjXYkrDGVkSshDCpaThfCwJWQjhWlz21GkhhEhr0nA+du6rvQkhRFIZVOIfCVFK1VVKXVBKXVRKxTiTRynloZRaH1H/p1LKJ0pdGaXU70qpM0qpU0qpBK+5KiNkIYRLSa4LzyuljMAcoA5wEziklNqqtT4bpVk34JHWuqhSqjUwDWillDIBq4AOWusTSqlsQFhC7ykjZCGES0nGEXIV4KLW+rLWOhRYB/i+0MYXWBHxfBNQS9nPTHkXOKm1PgGgtX6gtU7wbguSkIUQLkUl5V+UK1NGPPyirCofcCPK65sRZcTWRmttBZ4A2YCXAa2U2qmUOqqUGp6Y2GXKQgjhUpIyY6G1XggsjKM6tjW9eJ2MuNqYgDeBysAzYJdS6ojWeld88cgIWQjhUpJxyuImEPWOD/mB23G1iZg3zgQ8jCjfp7W+H3G1zO1AhQRjT0wHhRAirUjGC9QfAooppQorpdyB1sDWF9psBZ5frP19YLe2X0JzJ1BGKZUuIlHXBM6SAJmyEEK4lFhudv6vaK2tSql+2JOrEViqtT6jlJoEHNZabwWWACuVUhexj4xbRyz7SCk1A3tS18B2rfX3Cb2nJGQhhEtJzjP1Im7mvP2FsnFRnpuBFnEsuwr7oW+JJglZCOFS5PKb8Uj3wk0sXUGlglkcHUKKyOTh5ugQkt2ea655HwX/4ATPMUiTyhYs/p/XkZZPnZYRshDCpRhiPRItbZCELIRwKTJCFkIIJ2FKw5PIkpCFEC5FRshCCOEk5AL1QgjhJNJwPpaELIRwLWn5ehCSkIUQLkWmLIQQwklIQhZCCCeRdtOxJGQhhItJwwNkSchCCNeSiOscOy1JyEIIlyJHWQghhJOQnXpCCOEkZMpCCCGchExZCCGEk0jLI2Sn/jKZNG407771Bq2aNYq33ZnTp3itfGl2/bQTgKtXr9ChdXPatmjCyRPHALBarfTx64I5JCTF406I/727DOrdlY4tG9O5VRM2rYt5261rVy/Tp2s76rxRgXWrlkeWP370kH49OtK5dVP2790VWT56aH/uB/inRvjxCrfZmDSwI19OHBKj7q/Tx5g8sBM9fd/kyK+7I8vv3rzG5EGdmdi/A5fOnwLAZrMyY0x/LGZzqsUel3UfdmLzxN78b3JfvvloQIx6y9Mgfpo3ic2TevPtxwN5eOsqACFBj9n26RA2T+zF1eO/Rbb/ce5Enj5+kFrhxyosJJhDKz5h9ye92T2t
Dw+vno9Wf//iKbaPbs3ezwey9/OBXPhxHQCW4Ccc+GoEe6b3486pPyLbH1w6BfMTx/bpOZWEh7Nx6hFyQ98mtGzTlvGjR8bZxmazMfuLz6la7Y3Isi0b19Nv4GDy5M3H7Fkz+HRGeTZvWEf9hr54enmlRujxMhqN9Bk4lJdLlOLZ06f4dWxFpSqv4/NSkcg2GTNmYsDQURzYuzvasrt+3EHdBo15p049hg/sRfW3avHb/r0UK16K7DlypnZXYvh52wby5Pch5NnTGHVZc+Smy6Cx7NyyOlr5vh++oXmnPmTLlYf/LZ9L7w8/Zu/2LVR9uy4enp6pFXq8Ggz5BE/vTLHWHd+xnmz5i1Cn9zge373Bb2vmUH/wJ1w6tI9ir9fmpco1+eHLsfiUq8a1E3+QvUBR0mfOlso9iO7UN4vIWbwClTuNJNwahi3MEqNNtsKleK37uGhlt479QoHK75CvXHV+XzSBPK9W5e6Zg2TKVwTPTI7t03PG/08jZKXU1ykRSGwqVKxMxoyZ422zfu0q3q5dhyxZ//kwmEwmzBYLZrMZk8lEUGAg+/ftoUEj35QOOVGyZc/ByyVKAZAufXoKFS7M/YB70dpkyZqNEqVewWiK/p1pMpqwWCyEhoWilAGr1cqmtato3aFzaoUfp4f3/Tl16FfefLdxrPXZc+Uhf+GiKBX9Y2c0mQgNtRBqMWM0mXgWHMTJgwd4/Z36qRH2f/b4znXyliwLQObcBQh6cI9ngY8wGI1Yw0IJt4ahlCLcZuPMrm8o815zh8YbZn7Gw8tnKPhaHQAMJjfcvLwTtawyGrGFhWKzWlHKQLjNxuVftlLk7WYpGXKSKJX4h7OJd4SslNr6YhHwtlIqM4DWOva/vFTif+8ee3f/zLxFyzl7ZkxkeYvWbRk/ZiRhoaGMGjuRxQvm0qVHT6ecW7pz+xZ/XzhPydJlEtW+Vt36TBk7gp3fb6Vnvw/4dvN63q3fCE9Px4/81y/6gve79MMc8ixJy71dvzlLZ07CGhZGh74j2LZuKfVbdnKi7aXY8cVoUIqS1etRokb0L4qs+V/i6tHfyF30FfyvXCD4oT/PHt2naJW32bN4Ghd/30XlZl04u+87ir5eC5O7Y0f9zx7cxT19Jo6vm8WT21fInL8orzTpgckjelwPr11g72cD8MyUlVKNupIxd0Hyl6/JkdWfc+PwHko16MTV37aTv9LbmNw9HNSbmJRTTkYkTkJTFvmBs8BiQGNPyJWAz1M4rkSZMf1j+g8agtEY/c7WufPkZcES+0D+xvVrBAT441P4JcZ9OJywsDB69R1AIZ/Cjgg5mmfPnjF+5Af0GzyC9N6JG6F4e2fgk5lzAQgKfMLalUuZNO0Lpn80gaCgQFq17UjpMuVSMuxYnTh4gIyZslCoaAkunDqapGWz5czNsI/tffK/fYMnD++TJ78PSz6fiNUahm97P3LnK5gSYSdKo+Gfkz5zNkICH7Nj1odkyl2APC+/Gllftm4Lfl+/gP9N7kvWfD5kK1AEZTDi7pWe9/pPAuzzzCd3bqR2r7HsXzkLy7MgXq3dnFxFSqZ6f3S4jSe3LvFqUz+yFCrOqW8WcXH3JkrUax/ZJlP+ItQZsxiThxf3zh3m0LKPqDVqAW5e6akaMY0R+iyYi3s2U7nzKI5vmE1YSDBFajYhq0+JVO9TVE7zPf4vJDRlUQk4AowGnmit9wIhWut9Wut9cS2klPJTSh1WSh1etmRh8kX7gnNnTjN6xBAa16vF7p9+ZNpHk9i7++dobeZ+9QW9+g5g/ZpV1G3QiJ59+rNowdwUiymxrNYwxo/4gNrvNaDG27X/1TpWLJ5P+y492P3jdl4uUYoRYyaxaN6XyRxp4lw6d5LjB/czsltTFn46lgsnj7D48wlJXs+WlQvwbe/Hrm0beO2td2nctjvb1i5J/oCT4Pl8r1fGzBQqV42Aqxei1bt7padm58E0GzuHml2GYg5+QobsuaK1Ofb9GsrVb82lQ3vJ
XrAoNTp+wOFvlqdWF6LxzJQdz0zZyVKoOAB5y1Tj8a3L0dq4eabD5GH/1ZWrZCXCbTYswYHR2vz10zqK1W7JrWO/kDl/Ecq1GsC57StTpxPxMKAS/XA28Y6QtdbhwEyl1MaI/95LaJmI5RYCCwECzeE6OQKNzbc7/km+E8aOonqNt3jrnX+S25HDB8mZMxcFC/lgNodgUAYMBgMWBx9pobXm08njKVj4JVq26/Sv1nHz+jUe3A+gXIXKXPzrAh4eHiilCLXE3DmTGpp16kOzTn0AuHDqKDv/t5ruQyYkaR0XTh0lc7Yc5MpbgFCLGaUMGIxGQi2OO9IizGJG63DcPdMRZjFz6+xRyjdoG62N5VkwJncPjCY3Lhz4gdzFXsXdK31k/ZN7t3j6+CF5Xi7DgxuXMbm5g1LYwkJTuzsAeGbMglfm7AT738Q7Z34C/j5BhlwForUxBz7CI0NmlFI8uv4X6HDc02eIrA8OuI35yUOyF3mFwFuXMbi5o4Bwq2P6FFVaHiEn6igLrfVNoIVSqgEQmFD75DJ6xBCOHD7I48ePaVDnLfx698NqtQLQvGXreJfVWrN00Xw+nj4TgKbNWzJ21DBsNhsjR49P8djjc+rEMX7csY2XihajW7v3AejRZwD37t4FwLd5Sx7cv0/Pzq149vQpShnYtG4lK9Z9Gzm1sXjel3TvbT8Eq9a79RgzbCCb162mS8++julUHL5dtZBCxUpS7rXqXPnrLHOnjrTvtDt0gG9XL2bS3DWAfXt9v2E5PUdMAaBG3SYs/mw8tnAb7XsPc1j8IYGP+Hn+ZMB+SF+RKm9R4JVKnNv3PQAlazbg8Z0b7Fv+GUoZyJynIDU6Doq2jsPfrqCSr/2Lt0jlt/hp3iRO7/6Wio07pG5noni1qR9HVs8g3BZG+qy5Kdd6IFd/2wGAT7V63Dn5K1d/24EyGDG6uVOx/bBoc/rnd6ykRD17/PnK1+Dgsqlc2b+N4nXbOaQ/UaXlU6eV1ik2gAVSdoTsKE8tVkeHkCL+vhfs6BCS3Z+3Hzs6hBThHxzm6BBSxPSGxf9zNt11/n6ic06tEtmdKns79XHIQgiRVGn5KAunPlNPCCGSKjmPQ1ZK1VVKXVBKXVRKxThDTSnloZRaH1H/p1LK54X6gkqpYKXU0MTELglZCOFSVBL+xbsepYzAHKAeUApoo5Qq9UKzbsAjrXVRYCYw7YX6mcCOxMYuCVkI4VIMKvGPBFQBLmqtL2utQ4F1wIun+/oCKyKebwJqqYi9n0qpJsBl4EyiY09sQyGESAsMSiX6EfWciYiHX5RV5QNuRHl9M6KM2Npora3AEyCbUio9MAKYmJTYZaeeEMKlJGWXXtRzJhK5qheP4IirzURgptY6OCmXAJCELIRwKcl4HPJNIOoZM/mB23G0uamUMgGZgIfAa8D7SqlPgcxAuFLKrLWeHd8bSkIWQriUZDzo7RBQTClVGLgFtAbavtBmK9AJ+B14H9it7Sd3VI+MR6kJQHBCyRgkIQshXE0yZWSttVUp1Q/YCRiBpVrrM0qpScBhrfVWYAmwUil1EfvIOP5TiBMgCVkI4VKS89RprfV2YPsLZeOiPDcDLRJYx4TEvp8kZCGES0m75+lJQhZCuJo0nJElIQshXEpavpaFJGQhhEtJw1fflIQshHAtaTgfS0IWQrgW57k5btJJQhZCuJQ0nI9TPiHbXO+GIWRJ7+7oEFJE2QKZHR1CssuSzjW3VZVGMS7N6xKmN0zwZLYEpeF8LCNkIYSLScMZWRKyEMKlyGFvQgjhJGQOWQghnIQkZCGEcBIyZSGEEE5CRshCCOEk0nA+loQshHAxaTgjS0IWQriU5LxAfWqThCyEcClpNx1LQhZCuJo0nJElIQshXIoc9iaEEE4iDU8hS0IWQriWNJyPJSELIVyLXKBeCCGcRBrOxxgcHUB8pkwYTb133qTt+41jrT9y+CC1qlehQ6umdGjVlCUL5gLw6OFD/Lq0p+37jdm3
5+fI9sMG9SXA3z9VYo/PuDGjeKv66zTzbRhrvdaaT6ZOoWHdOrzftBHnzp4B4OqVy7Ru0YwWTRtz4vgxAKxWK37dOhMSEpJa4cdp8vjRvPf2G7Ru3ijW+iOHDvL2m5Vp17Ip7Vo2ZfGCOYB9e/Xo3I7WzRuxd/c/22uoE2yv0FALI/t0ZEiP1gzq2oL1y+fHaLNt4yoGdXmfwd1bMWFoLwLu3QHg1o2rDO/VjiE9WnPhzEkAbDYrE4f1xmJO/e01f3w7ru36mMMbP4wsmzqoCcf/N4aD60ex/vMeZPL2inXZ899P5NCGD/lj3UgOrB4eWT5lgC8H149i8eQOkWVtGlSmb5u3UqwfCVFJeDgbp07IDRo1ZeachfG2KVe+IivXb2Hl+i1069kHgB9/+J76jXxZtGItq1YsA2D/vj0UL1mKHDlzpnjcCfFt0ox5CxbHWX9g/y9cv3aVbTt+ZNyEyUyZNAGAjRvWM/CDIXz2xZesWLYUgA3r19KwkS9eXrH/IaWmBo2bMGtuwttr9YYtrN6whe49+wL27dWgkS9Lvl7HqhX2fu3ft4fiJRy/vdzc3Bn/+Xw+X7SOzxau4dih3/jr7KlobQoXLc60eSuZsXg9r9eoxcqFswD4adv/aNe9P0PHf8rWjSsB2Ll1EzVr18fDM/W318ptf+Dbd060sl1/nKdii6lUafUxf1/zZ1jXd+Ncvq7fLKq2/oQ3230KQEZvT6qWLUyVVh9jNBgoXTQvnh5udGhUlQUbf0nRvsQrDWdkp07I5StWImOmTElezmRyw2I2ExYaisGgsFqtrF/zNe07dk2BKJOuYqXK8fZrz+5dNGrcBKUUZcqWIygokIAAf0wmExazGXOIGZObicDAQPbt3UMj3yapGH3cKlSsTMaMSb8NlNFkwmKxEBYaijIYsFqtrF39NR06OX57KaXw8koHgM1qxWa1xvhDfqV85cgEW6zkqzwIsI/qjSYToaEWLBYzRqOJp8FBHP79F2q+G/svo5T269FLPHzyLFrZrj/OY7OFA3Dw1BXy5Ur89gsP17i72Wc9vTzcCLPa+KBTLeau24vVGp58gSeRSsI/Z5OkhKyUelMpNVgpFffXaCo7dfI47Vs2ZVBfPy5f+huA9+o14M/ff2VQXz+69+zL5g1rqdfAF08nGEUmhr//PXLlzh35Oleu3Pjfu0frNu1Y+fVypkwaT/cePVkwbw49/HqlqZ0Yp04ep23LJgzs68eli/btVbdeQ/747QAD+vagRy/79qrf0Hm2l81mY6hfG7o1r0OZilV5ueSrcbbdveNbylepBkBd3xZs27SahTOn0qxtVzZ+vYjm7bo57fbq6Ps6O389G2ud1pptc/vx6+rhdG32BgDBzyx8s+s4f6wbydXbDwgMDqFiqUJ8t/dUrOtILUol/uFs4t2pp5Q6qLWuEvG8B9AX2AKMV0pV0Fp/kgoxxqlEiVJ8s/1n0qVLz2/79zH8g/5s2voD3hkyMOMr+1xfYOATVi5fwiefz2LqpHEEBQbStkNnXi1bzpGhx0/HvDGsUoo8efOyZLn9p+/1a9cICPCncOGX+HDkMMLCwujbfyA+PoVTO9pEK16yFFt37CJduvT8un8fwz/ox+ZtO/HOkIGZsxcAEdtr2WKmzfiSjyaOJSjIvr3KlC3vsLiNRiOfLVzL0+AgPh03hOtXLlKwcNEY7X75aTuX/jrLpBmLAMiRKw+TZtincO7cusHDBwHkK+DDlx+PxWoNo3Xn3uQtUChV+xKX4d3ew2YLZ932Q7HWv9NlJncCnpAjizffze/Hhat3+fXoJWas+JkZK+zz/nPHtWXyvO/p3PR1alctyam/bzFt8c7U7AYABidMtImV0AjZLcpzP6CO1noi8C7QLq6FlFJ+SqnDSqnDy5cuSoYwY5fe25t06dIDUK16TaxWK48fPYrWZunCeXTu1pMff9hOiZKlGDNhCvNmf5FiMSWHnLlyc+/u3cjX9+7djTGX+tWXM+nb
fyBrVq+kQYNG9OnbnwVz//sde1OSd5Tt9UYc22vxgrl06d6TH3d8T4lSpRkz4SPmfeUc2yu9dwZKl6vEsUO/xag7eeRPNq9ZwsjJM3Fzj3mn67VL59CmS2+2b1lH9Vr1aNWpJxtWxj/fnlraNXqN+jVeofPo5XG2uRPwBICAR8Fs3X2SyqV9otWXLZ4fgL+v+dOu4Wu0H7GU0kXzUqRgjpQKOx7JN4mslKqrlLqglLqolIpxq2+llIdSan1E/Z9KKZ+I8jpKqSNKqVMR/30nMZEnlJANSqksSqlsgNJaBwBorZ8C1rgW0lov1FpX0lpX6ty1R2Li+Fce3A9AR4wmz5w+idbhZMr8zxzY9WtXCQjwp0KlyljMZgwGAyhFqMWSYjElh7fefodtW79Ba83JE8fx9s5Ajhz/JOTDhw6SM2cuChXywRwSgjIYMBiNhJjNDow6Yfejbq9TJwnXOsb2uh/gT4VKVTCbzRiUAaUUllDHba8njx/xNDgIAIvFzMkjf5KvgE+0Npf/Ps+CmR8xcvJMMmXJGmMdZ04cIWu2nOTJXxCLxYwyKAwGI6FOsL3qVCvJkM61eX/QAkLMYbG2Sefpjnc6j8jntV8vwZlLt6O1GdenIZPnfY+byYgxYogaHq5J5xnzyymlJdeUhVLKCMwB6gGlgDZKqVIvNOsGPNJaFwVmAtMiyu8DjbTWrwKdgJWJiT2h45AzAUewf5VopVRurfVdpZQ3qbCPcuzIoRw9cpDHjx/T6L236dGrH1ar/UPTrEVrdv/8I//buA6j0YSHpweTP/482vzcgjmz6Nl3IAB16tZnxAf9Wb9mJT1690/p0OM1YuhgDh86yOPHj6jzTg169+2P1Wr/fmvZqg3Va9TkwC/7aFivDp6eXkyaMjVyWa01ixbMY/rn9lFj8xatGDViKDabldFjJziiO5HGjBzCkcP27dXw3bfo0btfZL+aR2yvzRvWYjSZ8PTw4KNPom+vebNn0buffXu9W68Bwwb1Y92ar+nZZ4BD+gPw6MF9Zn86nnCbDa011WrWptLrNVi3bB5FipeicrWarFw4C3NICJ9PGgFA9py5GTllJmDfXptXLWHwOPvsXp0GzZg1dTQ2mw2/QaNStS8rPu5M9YrFyJ7Zm4s/TGby/O0M6/IuHu4mvpvXD4CDp64y4KN15MmRibnj2tK0/zxyZsvA+hn2gZXJaGT9jsP89Nu5yPU2eqsMR85cixxF/3nyKoc2fMjpv29x6q9bqdpHSNbEVAW4qLW+DKCUWgf4AlEn2n2BCRHPNwGzlVJKa30sSpszgKdSykNrHe/oQulY5isTopRKB+TSWl9JqO2jZ7akv4GT83I3OjqEFGEJc9ye8ZRy/cGzhBulQVUaxfj17BJCjs3+z/n0zpPQROecvJk9emKfjn1uodZ6IYBS6n2grta6e8TrDsBrWut+zxsrpU5HtLkZ8fpSRJv7Udq8D/TSWtdOKJ5/daae1voZkGAyFkKI1JaUo1gikm9ck/mxrejFZB9vG6VUaezTGIk6Ms2pj0MWQoikSsbzQm4CBaK8zg/cjquNUsqEfZr3YcTr/NiPSuuotb6UmNglIQshXEoyHod8CCimlCqslHIHWgNbX2izFftOO4D3gd1aa62Uygx8D4zSWv+a2NglIQshXEpynamntbYC/YCdwDlgg9b6jFJqklLq+QV2lgDZlFIXgcHA88n9fkBRYKxS6njEI8HrAPyrnXpJITv10g7ZqZd2yE69uAUEWxOdc3J4m5zqNBK5/KYQwqU4VYZNIknIQgiXYnDGi1QkkiRkIYRLScP5WHbqCSGEs5ARshDCpaTlEbIkZCGES3HGC88nliRkIYRLkRGyEEI4CUnIQgjhJGTKQgghnISMkIUQwkmk4XwsCVkI4WLScEaWhCyEcClp+dTpFL/aW2pSSvk9v/2KK3HFfrlin8A1++WKfXJWrnbqtF/CTdIkV+yXK/YJXLNfrtgnp+RqCVkIIdIsSchC
COEkXC0hu+o8lyv2yxX7BK7ZL1fsk1NyqZ16QgiRlrnaCFkIIdIsl0jISqmlSil/pdRpR8eSXJRSBePgQ+sAAAJGSURBVJRSe5RS55RSZ5RSAx0dU3JQSnkqpQ4qpU5E9Guio2NKLkopo1LqmFLqO0fHklyUUleVUqci7pp82NHxuDqXmLJQStUAgoH/a9/uWZyI4iiMP6fYQqJiIxJMsZ2NhcqyTToR8WURSwutLC0UC8EvIX4AbUTRZrUSREFFFtYXsqwWrqWFKAQR0W31WGS+QSbcyfX8YMike9L8uXPn5o7tg6V72iCpD/Rtb0jaBYyAs7Y/Fk6biiQBPdvbkhaANeCy7deF06Ym6SqwBOy2vVK6pw2SPgNLtr+XbvkfVLFCtv0K+FG6o022v9neaO5/A1vA/rJV0/PEdvN1obnmflUgaQCcBm6Vbon5VcVArp2kReAw8KZsSTuaR/tNYAw8s13D77oJXAP+lg5pmYGnkkaS8geRGctA7jhJO4FV4IrtX6V72mD7j+1DwABYljTX20ySVoCx7VHplhkY2j4CnAQuNduDMSMZyB3W7LGuAvdsPyzd0zbbP4GXwInCKdMaAmea/dYHwFFJd8smtcP21+ZzDDwClssW1S0DuaOal1+3gS3bN0r3tEXSXkl7mvsdwDHgU9mq6di+bntgexE4Bzy3fb5w1tQk9ZoXykjqAceBak4ydVEVA1nSfWAdOCDpi6SLpZtaMAQuMFltbTbXqdJRLegDLyR9AN4x2UOu5phYZfYBa5LeA2+Bx7afFG6qWhXH3iIialDFCjkiogYZyBERHZGBHBHRERnIEREdkYEcEdERGcgRER2RgRwR0REZyBERHfEPDkQ0fjPnGGcAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Express every confusion-matrix entry as a share of np.sum(cm) and draw it as an\n",
    "# annotated heatmap labelled with `stars` on both axes.\n",
    "# (assumes cm is the NumPy array returned by confusion_matrix -- TODO confirm)\n",
    "cm_share = pd.DataFrame(cm / np.sum(cm), index=stars, columns=stars)\n",
    "sns.heatmap(cm_share, annot=True, fmt='.1%', cmap='Blues')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:45:12.284710Z",
     "start_time": "2018-12-28T02:45:12.277127Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.44955063467061984"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fraction of test labels that y_pred matches exactly.\n",
    "accuracy_score(y_test, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T02:30:21.727266Z",
     "start_time": "2018-12-28T02:30:21.428123Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8614708105573701"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): the saved output was produced at execution count 36, out of order with\n",
    "# the neighbouring cells (81 and 55) -- confirm lgb_model / y_test still match it.\n",
    "lgb_scores = lgb_model.predict(X_test)\n",
    "roc_auc_score(y_true=y_test, y_score=lgb_scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-27T23:56:19.836967Z",
     "start_time": "2018-12-27T23:56:19.660296Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>139717.000000</td>\n",
       "      <td>139717.000000</td>\n",
       "      <td>139717.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>0.630986</td>\n",
       "      <td>0.209083</td>\n",
       "      <td>0.159931</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.007147</td>\n",
       "      <td>0.005648</td>\n",
       "      <td>0.004706</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.502827</td>\n",
       "      <td>0.152388</td>\n",
       "      <td>0.110754</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>0.629275</td>\n",
       "      <td>0.206945</td>\n",
       "      <td>0.158686</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>0.630822</td>\n",
       "      <td>0.208772</td>\n",
       "      <td>0.160465</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>0.632655</td>\n",
       "      <td>0.210202</td>\n",
       "      <td>0.161593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>0.700247</td>\n",
       "      <td>0.337525</td>\n",
       "      <td>0.262446</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   0              1              2\n",
       "count  139717.000000  139717.000000  139717.000000\n",
       "mean        0.630986       0.209083       0.159931\n",
       "std         0.007147       0.005648       0.004706\n",
       "min         0.502827       0.152388       0.110754\n",
       "25%         0.629275       0.206945       0.158686\n",
       "50%         0.630822       0.208772       0.160465\n",
       "75%         0.632655       0.210202       0.161593\n",
       "max         0.700247       0.337525       0.262446"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics of what lgb_model predicts for X_test, one column per output.\n",
    "lgb_pred = pd.DataFrame(lgb_model.predict(X_test))\n",
    "lgb_pred.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-27T23:40:48.817474Z",
     "start_time": "2018-12-27T23:35:53.942018Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 63.39%\n"
     ]
    }
   ],
   "source": [
    "# Fit a 100-tree random forest on the training split and report test accuracy.\n",
    "# random_state pins the bootstrap samples and per-split feature draws so that a\n",
    "# Restart & Run All reproduces the same forest; class_weight='balanced_subsample'\n",
    "# recomputes the class weights on each tree's bootstrap sample.\n",
    "rf = RandomForestClassifier(n_estimators=100,\n",
    "                            class_weight='balanced_subsample',\n",
    "                            random_state=42,\n",
    "                            n_jobs=-1)\n",
    "rf.fit(X_train, y_train)\n",
    "y_pred = rf.predict(X_test)\n",
    "print(f'Accuracy: {accuracy_score(y_true=y_test, y_pred=y_pred):.2%}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-27T23:50:06.311958Z",
     "start_time": "2018-12-27T23:50:04.657072Z"
    }
   },
   "outputs": [],
   "source": [
    "# Class-probability estimates from the fitted forest for every row of X_test.\n",
    "y_pred_prob = rf.predict_proba(X=X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-27T23:50:16.664162Z",
     "start_time": "2018-12-27T23:50:16.598883Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>139717.000000</td>\n",
       "      <td>139717.000000</td>\n",
       "      <td>139717.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>0.635541</td>\n",
       "      <td>0.207648</td>\n",
       "      <td>0.156812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.048588</td>\n",
       "      <td>0.041110</td>\n",
       "      <td>0.036620</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.190000</td>\n",
       "      <td>0.050000</td>\n",
       "      <td>0.030000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>0.600000</td>\n",
       "      <td>0.180000</td>\n",
       "      <td>0.130000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>0.640000</td>\n",
       "      <td>0.210000</td>\n",
       "      <td>0.160000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>0.670000</td>\n",
       "      <td>0.230000</td>\n",
       "      <td>0.180000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>0.860000</td>\n",
       "      <td>0.740000</td>\n",
       "      <td>0.340000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   0              1              2\n",
       "count  139717.000000  139717.000000  139717.000000\n",
       "mean        0.635541       0.207648       0.156812\n",
       "std         0.048588       0.041110       0.036620\n",
       "min         0.190000       0.050000       0.030000\n",
       "25%         0.600000       0.180000       0.130000\n",
       "50%         0.640000       0.210000       0.160000\n",
       "75%         0.670000       0.230000       0.180000\n",
       "max         0.860000       0.740000       0.340000"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics of y_pred_prob, one column per probability column.\n",
    "rf_prob_frame = pd.DataFrame(y_pred_prob)\n",
    "rf_prob_frame.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-27T23:49:06.348646Z",
     "start_time": "2018-12-27T23:49:06.325988Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    139715\n",
       "1         2\n",
       "dtype: int64"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# How often each label occurs in y_pred.\n",
    "predicted_labels = pd.Series(y_pred)\n",
    "predicted_labels.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-27T23:47:01.765414Z",
     "start_time": "2018-12-27T23:47:01.758140Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    354263\n",
       "1    116809\n",
       "2     87796\n",
       "dtype: int64"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# How often each label occurs in the training target.\n",
    "train_labels = pd.Series(y_train)\n",
    "train_labels.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-27T23:47:24.456819Z",
     "start_time": "2018-12-27T23:47:24.444497Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6338956605137528"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Share of test labels equal to 0, i.e. the accuracy of always predicting class 0.\n",
    "np.mean(y_test == 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-27T23:40:48.919212Z",
     "start_time": "2018-12-27T23:40:48.819056Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[88564,     2,     0],\n",
       "       [29202,     0,     0],\n",
       "       [21949,     0,     0]])"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Confusion matrix: rows are the true labels, columns the predicted labels.\n",
    "confusion_matrix(y_test, y_pred)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Logistic Regression"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Binary Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T01:29:09.839140Z",
     "start_time": "2018-12-28T01:29:04.044264Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 50.05%\n"
     ]
    }
   ],
   "source": [
    "# Logistic regression with scikit-learn's default settings, scored on X_test / y_test.\n",
    "lr = LogisticRegression().fit(X_train, y_train)\n",
    "y_pred = lr.predict(X_test)\n",
    "accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)\n",
    "print(f'Accuracy: {accuracy:.2%}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Multinomial Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T04:17:30.953069Z",
     "start_time": "2018-12-28T04:17:16.299083Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 34.60%\n"
     ]
    }
   ],
   "source": [
    "# Multinomial logistic regression fitted with L-BFGS; class_weight='balanced'\n",
    "# weights each class inversely to its frequency in y_train.\n",
    "lr = LogisticRegression(class_weight='balanced',\n",
    "                        multi_class='multinomial',\n",
    "                        solver='lbfgs')\n",
    "lr.fit(X_train, y_train)\n",
    "y_pred = lr.predict(X_test)\n",
    "accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)\n",
    "print(f'Accuracy: {accuracy:.2%}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-28T04:18:01.235296Z",
     "start_time": "2018-12-28T04:18:01.179436Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 7624,  3105,  1792,  2075,  4929],\n",
       "       [ 4957,  5365,  3126,  2027,  4111],\n",
       "       [ 3142,  3631,  4546,  3171,  5017],\n",
       "       [ 2301,  1756,  3307,  4580,  7430],\n",
       "       [ 1742,   888,  1363,  3653, 11499]])"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Confusion matrix: rows are the true labels, columns the predicted labels.\n",
    "confusion_matrix(y_test, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
